Configuration¶
Basic Config File¶
The config file tells the hlink program what to link and how to link it. A description of the different sections of a configuration file is below. For reference, here is an example of a relatively basic config file. This config file is used by the examples/tutorial/tutorial.py script for linking, and there is a more detailed discussion of the config file in the README in examples/tutorial.
Note that this config is written in TOML, but hlink is also able to work with JSON config files.
id_column = "id"
[datasource_a]
alias = "a"
file = "data/A.csv"
[datasource_b]
alias = "b"
file = "data/B.csv"
[[column_mappings]]
column_name = "NAMEFRST"
transforms = [
    {type = "lowercase_strip"}
]
[[column_mappings]]
column_name = "NAMELAST"
transforms = [
    {type = "lowercase_strip"}
]
[[column_mappings]]
column_name = "AGE"
transforms = [
    {type = "add_to_a", value = 10}
]
[[column_mappings]]
column_name = "SEX"
[[blocking]]
column_name = "SEX"
[[blocking]]
column_name = "AGE_2"
dataset = "a"
derived_from = "AGE"
expand_length = 2
explode = true
[[comparison_features]]
alias = "NAMEFRST_JW"
column_name = "NAMEFRST"
comparison_type = "jaro_winkler"
[[comparison_features]]
alias = "NAMELAST_JW"
column_name = "NAMELAST"
comparison_type = "jaro_winkler"
[comparisons]
operator = "AND"
[comparisons.comp_a]
comparison_type = "threshold"
feature_name = "NAMEFRST_JW"
threshold = 0.79
[comparisons.comp_b]
comparison_type = "threshold"
feature_name = "NAMELAST_JW"
threshold = 0.84
Advanced Config File¶
Here is an example of a more complex config file that makes use of more of hlink’s features. It uses machine learning to probabilistically link the two datasets.
id_column = "histid"
drop_data_from_scored_matches = false
# --------- DATASOURCES --------------
[datasource_a]
alias = "us1900"
file = "/path/to/us1900m_usa.P.parquet"
[datasource_b]
alias = "us1910"
file = "/path/to/us1910m_usa.P.parquet"
# --------- FILTERS --------------
[[filter]]
expression = "NAMELAST is not null and NAMELAST != ''"
[[filter]]
training_data_subset = true
datasource = "a"
[[filter]]
expression = "age >= 5"
datasource = "b"
# --------- COLUMN MAPPINGS --------------
[[column_mappings]]
column_name = "serialp"
[[column_mappings]]
column_name = "sex"
[[column_mappings]]
column_name = "age"
[[column_mappings]]
column_name = "namelast"
[[column_mappings]]
alias = "namefrst_clean"
column_name = "namefrst"
transforms = [
    { type = "lowercase_strip" },
    { type = "rationalize_name_words" },
    { type = "remove_qmark_hyphen"},
    { type = "replace_apostrophe"},
    { type = "remove_suffixes", values = ["jr", "sr", "ii", "iii"] },
    { type = "remove_alternate_names"},
    { type = "condense_strip_whitespace"},
]
[[column_mappings]]
alias = "namefrst_split"
column_name = "namefrst_clean"
transforms = [ { type = "split" } ]
[[column_mappings]]
alias = "namefrst_std"
column_name = "namefrst_split"
transforms = [
    { type = "array_index", value = 0 }
]
[[column_mappings]]
alias = "bpl_orig"
column_name = "bpl"
transforms = [
    { type = "divide_by_int", value = 100 },
    { type = "get_floor" }
]
[[column_mappings]]
alias = "statefip"
column_name = "statefip_h"
[[column_mappings]]
column_name = "birthyr"
alias = "clean_birthyr"
[[column_mappings.transforms]]
type = "mapping"
mappings = {9999 = "", 1999 = ""}
output_type = "int"
[[column_mappings]]
alias = "relate_div_100"
column_name = "relate"
transforms = [
    { type = "divide_by_int", value = 100 },
    { type = "get_floor" }
]
# --------- SUBSTITUTIONS --------------
[[substitution_columns]]
column_name = "namefrst_std"
[[substitution_columns.substitutions]]
join_column = "sex"
join_value = "1"
substitution_file = "/path/to/name_std/male.csv"
[[substitution_columns.substitutions]]
join_column = "sex"
join_value = "2"
substitution_file = "/path/to/name_std/female.csv"
# --------- FEATURE SELECTIONS --------------
[[feature_selections]]
input_column = "clean_birthyr"
output_column = "replaced_birthyr"
condition = "case when clean_birthyr is null or clean_birthyr == '' then year - age else clean_birthyr end"
transform = "sql_condition"
[[feature_selections]]
input_column = "namelast"
output_column = "namelast_bigrams"
transform = "bigrams"
[[feature_selections]]
input_column = "bpl_orig"
output_column = "bpl_clean"
condition = "case when bpl_str == 'washington' and bpl2_str=='washington' then 53 when (bpl_str is null or bpl_str == '') and bpl2_str=='washington' then 53 when bpl_str == 'washington' and (bpl2_str=='' or bpl2_str is null) then 53 else bpl_orig end"
transform = "sql_condition"
[[feature_selections]]
input_column = "bpl_clean"
output_column = "region"
transform = "attach_variable"
region_dict = "/path/to/region.csv"
col_to_join_on = "bpl"
col_to_add = "region"
null_filler = 99
col_type = "float"
# --------- POTENTIAL MATCHES UNIVERSE -------------
[[potential_matches_universe]]
expression = "sex == 1"
# --------- BLOCKING --------------
[[blocking]]
column_name = "sex"
[[blocking]]
column_name = "birthyr_3"
dataset = "a"
derived_from = "replaced_birthyr"
expand_length = 3
explode = true
[[blocking]]
column_name = "namelast_bigrams"
explode = true
# --------- COMPARISONS --------------
[comparisons]
operator = "AND"
[comparisons.comp_a]
comparison_type = "threshold"
feature_name = "namefrst_std_jw"
threshold = 0.8
[comparisons.comp_b]
comparison_type = "threshold"
feature_name = "namelast_jw"
threshold = 0.75
# --------- HOUSEHOLD COMPARISONS (post-blocking filters) -------------
[hh_comparisons]
comparison_type = "threshold"
feature_name = "byrdiff"
threshold_expr = "<= 10"
# --------- COMPARISON FEATURES --------------
[[comparison_features]]
alias = "region"
column_name = "region"
comparison_type = "fetch_a"
categorical = true
[[comparison_features]]
alias = "namefrst_std_jw"
column_name = "namefrst_std"
comparison_type = "jaro_winkler"
[[comparison_features]]
alias = "namelast_jw"
column_name = "namelast"
comparison_type = "jaro_winkler"
[[comparison_features]]
alias = "sex_equals"
column_name = "sex"
comparison_type = "equals"
categorical = true
[[comparison_features]]
alias = "relate_a"
column_name = "relate_div_100"
comparison_type = "fetch_a"
# --------- PIPELINE-GENERATED FEATURES ------------
[[pipeline_features]]
input_columns = ["sex_equals", "region"]
output_column = "sex_region_interaction"
transformer_type = "interaction"
[[pipeline_features]]
input_column = "relate_a"
output_column = "relatetype"
transformer_type = "bucketizer"
categorical = true
splits = [1,3,5,9999]
# --------- TRAINING --------------
[training]
independent_vars = [ "namelast_jw", "region", "hits", "sex_region_interaction", "relatetype"]
scale_data = false
dataset = "/path/to/training_data.csv"
dependent_var = "match"
score_with_model = true
use_training_data_features = false
split_by_id_a = true
decision = "drop_duplicate_with_threshold_ratio"
n_training_iterations = 2
output_suspicious_TD = true
param_grid = true
model_parameters = [
    { type = "random_forest", maxDepth = [7], numTrees = [100], threshold = [0.05, 0.005], threshold_ratio = [1.2, 1.3] },
    { type = "logistic_regression", threshold = [0.50, 0.65, 0.80], threshold_ratio = [1.0, 1.1] }
]
chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }
# --------- HOUSEHOLD TRAINING --------------
[hh_training]
prediction_col = "prediction"
hh_col = "serialp"
independent_vars = ["namelast_jw", "namefrst_std_jw", "relatetype", "sex_equals"]
scale_data = false
dataset = "/path/to/hh_training_data_1900_1910.csv"
dependent_var = "match"
score_with_model = true
use_training_data_features = false
split_by_id_a = true
decision = "drop_duplicate_with_threshold_ratio"
n_training_iterations = 10
output_suspicious_TD = true
param_grid = false
model_parameters = [
    { type = "random_forest", maxDepth = 6, numTrees = 50, threshold = 0.5, threshold_ratio = 1.0 },
    { type = "probit", threshold = 0.5, threshold_ratio = 1.0 }
]
chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }
Top level configs¶
These configs should go at the top of your config file, outside of any section header:
id_column
Required. Specify the id column that uniquely identifies a record in each dataset.
id_column = "id"
drop_data_from_scored_matches
Optional. Whether the scored potential matches should be output with full features data or with just ids and match information.
drop_data_from_scored_matches = false
Data sources¶
Header names: datasource_a, datasource_b
Description: Specifies your input data.
Required: True
Type: Object
Attributes:
alias – Type: string. The short name for the datasource. Must be alphanumeric with no spaces.
file – Type: string. Required. The path to the input file. The file can be csv or parquet.
convert_ints_to_longs – Type: boolean. Optional. If set to true, automatically convert each column with type int in the input file to type long. This can be especially useful when reading from CSV files, as Spark may assume that columns are type int when they should be long. Parquet files have their own schema included in the file, so this may be less useful for them. Note that Spark also sometimes uses the term bigint to mean the same thing as long.
[datasource_a]
alias = "us1900"
file = "/path/to/my_file.csv"
convert_ints_to_longs = true
Filter¶
Header name: filter
Description: Specifies filters to apply to your input data.
Required: False
Type: List
Attributes:
expression – Type: string. SQL expression to apply to your input datasets. Cannot be combined with training_data_subset in a single filter.
training_data_subset – Type: boolean. If set to true, will subset your input data to only include records that are also in your training data. Cannot be combined with expression in a single filter.
datasource – Type: string. If you want to limit the filter to operate only on dataset a or b, you can specify that with this attribute.
[[filter]]
training_data_subset = true
datasource = "a"
[[filter]]
expression = "NAMELAST is not null and NAMELAST != ''"
[[filter]]
expression = "age >= 5"
datasource = "b"
Column Mappings¶
Header name: column_mappings
Description: Base column mappings and transformations to extract from your input datasets. Each column mapping requires a column_name which tells it which input column to read from. Optionally you may provide an alias for the column and transforms to modify it as it is read in. There are some additional attributes listed below that are meant for advanced usage; these are described in more detail on the column mappings page, and a short sketch of them appears after the example below.
Required: True
Type: List
Attributes:
column_name – Type: string. The name of the column in the input data.
alias – Type: string. Optional. The new name of the column to use in hlink. By default, this is the same as column_name.
transforms – Type: List. Optional. A list of transforms to apply, in order, to the input data. See the column mapping transforms section for more information.
set_value_column_a – Type: Any. Optional. Set all records for dataset A to the given literal value.
set_value_column_b – Type: Any. Optional. Set all records for dataset B to the given literal value.
override_column_a – Type: string. Read from this column in dataset A instead of the column specified with column_name.
override_column_b – Type: string. Read from this column in dataset B instead of the column specified with column_name.
override_transforms – Type: List. Transforms to apply to the override column specified with override_column_a or override_column_b.
[[column_mappings]]
column_name = "age"
[[column_mappings]]
alias = "namefrst_clean"
column_name = "namefrst"
transforms = [
    { type = "lowercase_strip" },
    { type = "rationalize_name_words" },
    { type = "remove_qmark_hyphen"},
    { type = "replace_apostrophe"},
    { type = "remove_suffixes", values = ["jr", "sr", "ii", "iii", "iv", "v", "vi", "vii", "viii"] },
    { type = "remove_alternate_names"},
    { type = "condense_strip_whitespace"}
]
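The advanced attributes are not shown in the example above. As a minimal sketch, assuming hypothetical column names, a mapping that reads a differently-named column from dataset B and a mapping that pins each dataset to a constant value might look like:
[[column_mappings]]
column_name = "namefrst"
# Hypothetical: dataset B stores first names under a different column name.
override_column_b = "name_first"
override_transforms = [
    { type = "lowercase_strip" }
]
[[column_mappings]]
column_name = "year"
# Hypothetical: set the census year to a literal value for every record.
set_value_column_a = 1900
set_value_column_b = 1910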
Substitution Columns¶
Header name: substitution_columns
Description: Substitutions to apply to data after column mappings.
Required: False
Type: List
Attributes:
column_name – Type: string. Required. Column to apply substitutions to.
substitutions – Type: list. A list of substitutions to apply. See the substitutions section for more information.
[[substitution_columns]]
column_name = "namefrst_std"
[[substitution_columns.substitutions]]
join_column = "sex"
join_value = "1"
substitution_file = "/path/to/name_std/male.csv"
[[substitution_columns.substitutions]]
join_column = "sex"
join_value = "2"
substitution_file = "/path/to/name_std/female.csv"
Feature Selections¶
Header name: feature_selections
Description: A list of feature selections to apply to the input data after substitutions and column mappings. See the feature selection transforms section for more information, including information on the specific transforms available.
Required: False
Type: List
Attributes:
input_column – Type: string. Required. The name of the input column.
output_column – Type: string. Required. The name of the output column.
transform – Type: string. The name of the transform to apply to the column.
Other attributes vary depending on transform type; see the sql_condition example after the config sample below.
[[feature_selections]]
input_column = "namelast_clean"
output_column = "namelast_clean_bigrams"
transform = "bigrams"
[[feature_selections]]
input_column = "bpl_clean"
output_column = "region"
transform = "attach_variable"
region_dict = "/path/to/region.csv"
col_to_join_on = "bpl"
col_to_add = "region"
null_filler = 99
col_type = "float"
Potential Matches Universe¶
Header name: potential_matches_universe
Description: Limits the universe of potential matches generated, using an expression passed to a SQL query.
Required: False
Type: List
Attributes:
expression – Type: string. Required. The expression to use to filter prepped_df_(a/b) before generating potential matches.
[[potential_matches_universe]]
# limits potential matches created to only men
expression = "sex == 1"
Blocking¶
Header name: blocking
Description: Describes what columns to block on and how to create the blocks for the potential matches.
Required: True
Type: List
Attributes:
column_name – Type: string. Required. The name of the column in the existing data to block on if not exploded; the name of the newly exploded column if explode = true.
explode – Type: boolean. Optional. If true, will attempt to “explode” the column by creating duplicate rows for each value in the column. Only works on columns that are arrays of values or when expand_length is set.
dataset – Type: string. Optional. Must be a or b and used in conjunction with explode. Will only explode the column from the a or b dataset when specified.
derived_from – Type: string. Used in conjunction with explode = true. Specifies an input column from the existing dataset to be exploded.
expand_length – Type: integer. When explode is used on a column that is an integer, this can be specified to create an array with a range of integer values from (original_value minus expand_length) to (original_value plus expand_length). For example, if the input column value for birthyr is 1870, explode is true, and the expand_length is 3, the exploded column birthyr_3 value would be the array [1867, 1868, 1869, 1870, 1871, 1872, 1873].
or_group – Type: string. Optional. The “OR group” to which this blocking table belongs. Blocking tables that belong to the same OR group are joined by OR in the blocking condition instead of AND. By default each blocking table belongs to a different OR group. For example, suppose that your dataset has 3 possible birthplaces BPL1, BPL2, and BPL3 for each record. If you don’t provide OR groups when blocking on each BPL variable, then you will get a blocking condition like (a.BPL1 = b.BPL1) AND (a.BPL2 = b.BPL2) AND (a.BPL3 = b.BPL3). But if you set or_group = "BPL" for each of the 3 variables, then you will get a blocking condition like this instead: (a.BPL1 = b.BPL1 OR a.BPL2 = b.BPL2 OR a.BPL3 = b.BPL3). Note the parentheses around the entire OR group condition. Other OR groups would be connected to the BPL OR group with an AND condition. A sketch of or_group appears after the example below.
[[blocking]]
column_name = "bpl"
[[blocking]]
column_name = "birthyr_3"
dataset = "a"
derived_from = "birthyr"
expand_length = 3
explode = true
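As a minimal sketch of the or_group attribute described above (the BPL1, BPL2, and BPL3 column names are illustrative, not real hlink variables), blocking on any one of three birthplace columns would look like:
[[blocking]]
column_name = "BPL1"
or_group = "BPL"
[[blocking]]
column_name = "BPL2"
or_group = "BPL"
[[blocking]]
column_name = "BPL3"
or_group = "BPL"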
Comparisons¶
Header name: comparisons
Description: A set of comparisons which filter the potential matches. Only record pairs which satisfy the comparisons qualify as potential matches. See comparisons for some more information.
Required: True
Type: Object
There are two different forms that the comparisons table may take. It may either be a single comparison definition, or it may be a nested comparison definition with multiple sub-comparisons.
Single Comparison¶
Attributes:
comparison_type
– Type:string
. Required. The type of the comparison. Currently the only supported comparison type is"threshold"
, which compares a comparison feature to a given value.feature_name
– Type:string
. Required. The comparison feature to compare against.threshold
– Type:Any
. Optional. The value to compare against.threshold_expr
– Type:string
. Optional. A SQL condition which defines the comparison on the comparison feature named byfeature_name
.
The comparison definition must contain either threshold
or threshold_expr
,
but not both. Providing threshold = X
is equivalent to the threshold
expression threshold_expr >= X
.
# Only record pairs with namefrst_jw >= 0.79 can be
# potential matches.
[comparisons]
comparison_type = "threshold"
feature_name = "namefrst_jw"
threshold = 0.79
# Only record pairs with flag < 0.5 can be potential matches.
[comparisons]
comparison_type = "threshold"
feature_name = "flag"
threshold_expr = "< 0.5"
Multiple Comparisons¶
Attributes:
operator
– Type:string
. Required. The logical operator which connects the two sub-comparisons. May be"AND"
or"OR"
.comp_a
– Type:object
. Required. The first sub-comparison.comp_b
– Type:object
. Required. The second sub-comparison.
Both comp_a
and comp_b
are recursive comparison sections and may contain
either a single comparison or another set of sub-comparisons. Please see the
comparisons documentation for
more details and examples.
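For example, the basic config file at the top of this page joins two threshold comparisons with AND, so that record pairs must pass both name similarity thresholds to qualify as potential matches:
[comparisons]
operator = "AND"
[comparisons.comp_a]
comparison_type = "threshold"
feature_name = "NAMEFRST_JW"
threshold = 0.79
[comparisons.comp_b]
comparison_type = "threshold"
feature_name = "NAMELAST_JW"
threshold = 0.84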
Household Comparisons¶
Header name: hh_comparisons
Description: A set of comparisons which filter the household potential matches. hh_comparisons has the same configuration structure as comparisons and works in a similar way, except that it applies during the hh_matching task instead of matching. You can read more about comparisons here.
# Only household record pairs with an age difference <= 10 can be
# household potential matches.
[hh_comparisons]
comparison_type = "threshold"
feature_name = "byrdiff"
threshold_expr = "<= 10"
Comparison Features¶
Header name: comparison_features
Description: A list of comparison features to create when comparing records. Comparisons for individual and household linking rounds are both represented here – there is no need to duplicate comparisons used in both rounds; simply specify the column_name in the appropriate training or hh_training section of the config. See the comparison features documentation page for more information.
Required: True
Type: List
Attributes:
alias – Type: string. Optional. The name of the comparison feature column to be generated. If not specified, the output column will default to column_name.
column_name – Type: string. The name of the column to compare.
comparison_type – Type: string. The name of the comparison type to use.
categorical – Type: boolean. Optional. Whether the output data should be treated as categorical data (important information used during one-hot encoding and vectorizing in the machine learning pipeline stage).
Other attributes may be included as well depending on comparison_type. See the comparison features page for details on each comparison type.
[[comparison_features]]
alias = "race"
column_name = "race"
comparison_type = "equals"
categorical = true
[[comparison_features]]
alias = "namefrst_jw"
column_name = "namefrst_unstd"
comparison_type = "jaro_winkler"
[[comparison_features]]
column_name = "durmarr"
comparison_type = "new_marr"
upper_threshold = 7
Pipeline-generated Features¶
Header name: pipeline_features
Description: Features to be added in the model pipeline created for scoring a dataset. These features cannot be used in the comparisons section of the config and are for creating more robust ML models. They typically leverage code available in the Spark Pipeline API.
Required: False
Type: List
Attributes:
transformer_type – Type: string. Required. See pipeline features for more information on the available transformer types.
input_column – Type: string. Use either input_column or input_columns. Used if a single input column is needed for the pipeline feature.
input_columns – Type: List of strings. Use either input_column or input_columns. Used if a list of input columns is needed for the pipeline feature.
output_column – Type: string. The name of the new pipeline feature column to be generated.
categorical – Type: boolean. Optional. Whether the output data should be treated as categorical data (important information used during one-hot encoding and vectorizing in the machine learning pipeline stage).
Other attributes may be included as well depending on the particular pipeline feature transformer_type.
[[pipeline_features]]
input_columns = ["sex_equals", "regionf"]
output_column = "sex_regionf_interaction"
transformer_type = "interaction"
[[pipeline_features]]
input_column = "immyear_diff"
output_column = "immyear_caution"
transformer_type = "bucketizer"
categorical = true
splits = [-1,0,6,11,9999]
Training and models¶
Header name: training
Description: Specifies the training data set as well as a myriad of attributes related to training a model, including the dependent variable within that dataset, the independent variables created from the comparison_features section, and the different models you want to use for either model exploration or scoring.
Required: False
Type: Object
Attributes:
dataset – Type: string. Location of the training dataset. Must be a csv file.
dependent_var – Type: string. Name of the dependent variable in the training dataset.
independent_vars – Type: list. List of independent variables to use in the model. These must be either part of pipeline_features or comparison_features.
chosen_model – Type: object. The model to train with in the training task and score with in the matching task. See the models section for more information on model specifications.
threshold – Type: float. The threshold above which model probability values are accepted as true predictions. Can be used to specify a threshold to use for all models, or can be specified within each chosen_model and model_parameters specification.
decision – Type: string. Optional. Specifies which decision function to use to create the final prediction. The first option is drop_duplicate_a, which drops any links for which a record in the a data set has a predicted match more than one time. The second option is drop_duplicate_with_threshold_ratio, which only keeps links for which the a record has the highest probability out of all of its potential links and for which the ratio between its best and second-best links is at least the threshold_ratio.
threshold_ratio – Type: float. Optional. For use when decision is drop_duplicate_with_threshold_ratio. Specifies the smallest possible ratio to accept between a best and second best link for a given record. Can be used to specify a threshold ratio (beta threshold) to use for all models. Alternatively, unique threshold ratios can be specified in each individual chosen_model and model_parameters specification. For example, under this definition, with threshold_ratio = 1.2 an a record whose best link has probability 0.9 and whose second-best link has probability 0.8 would be dropped, since 0.9 / 0.8 = 1.125 < 1.2.
model_parameters – Type: list. Specifies models to test out in the model_exploration task. See the models section for more information on model specifications.
param_grid – Type: boolean. Optional. If you would like to evaluate multiple hyper-parameters for a single model type in your model_parameters specification, set this to true and give the hyper-parameter inputs as arrays of length >= 1 instead of single values; each combination is then evaluated as its own model (see the sketch after the example below).
score_with_model – Type: boolean. If set to false, will skip the apply_model step of the matching task. Use this if you want to use the run_all_steps command and are just trying to generate potential links, such as for the creation of training data.
n_training_iterations – Type: integer. Optional; default value is 10. The number of training iterations to use during the model_exploration task.
scale_data – Type: boolean. Optional. Whether to scale the data as part of the machine learning pipeline.
use_training_data_features – Type: boolean. Optional. If the identifiers in the training data set are not present in your raw input data, you will need to set this to true, or training features will not be able to be generated, giving null column errors. For example, if the training data set you are using has individuals from 1900 and 1910, but you are about to train a model to score the 1930-1940 potential matches, you need this to be set to true or it will fail, since the individual IDs are not present in the 1930 and 1940 raw input data. If you were about to train a model to score the 1900-1910 potential matches with this same training set, it would be best to set this to false, so you can be sure the training features are created from scratch to match your exact current configuration settings, although if you know the features haven’t changed, you could set it to true to save a small amount of processing time.
output_suspicious_TD – Type: boolean. Optional. Used in the model_exploration link task. Outputs tables of potential matches that the model repeatedly scores differently than the match value given by the training data. Helps to identify false positives/false negatives in the training data, as well as areas that need additional training feature coverage in the model, or need increased representation in the training data set.
split_by_id_a – Type: boolean. Optional. Used in the model_exploration link task. When set to true, ensures that all potential matches for a given individual with ID_a are grouped together in the same train-test-split group. For example, if individual histid_a “A304BT” has three potential matches in the training data, one each to histid_b “B200”, “C201”, and “D425”, all of those potential matches would either end up in the “train” split or the “test” split when evaluating the model performance.
feature_importances – Type: boolean. Optional. Whether to record feature importances or coefficients for the training features when training the ML model. Set this to true to enable training step 3.
[training]
independent_vars = ["race", "srace", "race_interacted_srace", "hits", "hits2", "exact_mult", "ncount", "ncount2", "region", "namefrst_jw","namelast_jw","namefrst_std_jw","byrdiff", "f_interacted_jw_f", "jw_f", "f_caution", "f_pres", "fbplmatch", "m_interacted_jw_m", "jw_m", "m_caution", "m_pres", "mbplmatch", "sp_interacted_jw_sp", "jw_sp", "sp_caution", "sp_pres", "mi", "fsoundex", "lsoundex", "rel", "oth", "sgen", "nbors", "county_distance", "county_distance_squared", "street_jw", "imm_interacted_immyear_caution", "immyear_diff", "imm"]
scale_data = false
dataset = "/path/to/1900_1910_training_data_20191023.csv"
dependent_var = "match"
use_training_data_features = false
output_suspicious_TD = true
split_by_id_a = true
score_with_model = true
feature_importances = true
decision = "drop_duplicate_with_threshold_ratio"
n_training_iterations = 10
param_grid = false
model_parameters = [
    { type = "random_forest", maxDepth = 6, numTrees = 50 },
    { type = "probit", threshold = 0.5}
]
chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }
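When param_grid is set to true, hyper-parameters in model_parameters are given as arrays instead, as in the advanced config earlier on this page; each combination is evaluated as a separate model during model exploration:
param_grid = true
model_parameters = [
    { type = "random_forest", maxDepth = [7], numTrees = [100], threshold = [0.05, 0.005], threshold_ratio = [1.2, 1.3] },
    { type = "logistic_regression", threshold = [0.50, 0.65, 0.80], threshold_ratio = [1.0, 1.1] }
]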
Household training and models¶
Header name: hh_training
Description: Specifies the household training data set as well as a myriad of attributes related to training a model, including the dependent var within that data set, the independent vars created from the comparison_features section, and the different models you want to use.
Required: False
Type: Object
Attributes:
All of the attributes and models available in training may also be used here.
prediction_col – Type: string. Required. The name of the column in which the final prediction value is recorded during the individual linking round scoring step.
hh_col – Type: string. Required. The name of the column with the household identifier.
[hh_training]
prediction_col = "prediction"
hh_col = "serialp"
independent_vars = ["namelast_jw","namefrst_jw","namefrst_std_jw", "jw_max_a", "jw_max_b", "f1_match", "f2_match", "byrdifcat", "racematch", "imm", "bplmatch", "imm_interacted_bplmatch", "sexmatch", "mardurmatch", "relatetype", "relatematch", "relatetype_interacted_relatematch"]
scale_data = false
dataset = "/path/to/hh_training_data_1900_1910.csv"
dependent_var = "match"
use_training_data_features = false
output_suspicious_TD = true
split_by_id_a = true
score_with_model = true
feature_importances = true
decision = "drop_duplicate_with_threshold_ratio"
param_grid = true
n_training_iterations = 10
model_parameters = [
    { type = "logistic_regression", threshold = [0.5], threshold_ratio = [1.1]},
    { type = "random_forest", maxDepth = [5, 6, 7], numTrees = [50, 75, 100], threshold = [0.5], threshold_ratio = [1.0, 1.1, 1.2]}
]
chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }