rm(list=ls(all=t))
filename <- "Survey Data - Ecuador_Public Use" # !!!Update filename
functions_vers <- "functions_1.7.R" # !!!Update helper functions file
source (functions_vers)
Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
#!!!Save flagged dictionary in .csv format, add "DatasetReview" to name and continue processing data with subset of flagged variables
# !!!No direct PII
# !!!No direct PII-team
# !!!No small locations
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less.
# Recode education attainment of adults to reduce risk of re-identification
# !!!No education attainment of adults.
# Top code household composition variables with large and unusual numbers
mydata <- top_recode ("b_v6", break_point=10, missing=c(888, 999999)) # Topcode cases with 10 or more individuals.
## [1] "Frequency table before encoding"
## b_v6. How Many People, Including Yourself, Live In Your Household?
## 1 2 3 4 5 6 7 8 9 10 11 12 14 16 29 <NA>
## 2 23 72 107 179 108 64 32 17 13 6 6 1 1 1 44
## [1] "Frequency table after encoding"
## b_v6. How Many People, Including Yourself, Live In Your Household?
## 1 2 3 4 5 6 7 8
## 2 23 72 107 179 108 64 32
## 9 10 or more <NA>
## 17 28 44
mydata <- top_recode ("b_hhsize_2016_masked", break_point=10, missing=c(888, 999999)) # Topcode cases with 10 or more individuals.
## [1] "Frequency table before encoding"
## b_hhsize_2016_masked. Household Size
## 2 or less 3 4 5 6 7 8 9 10
## 25 72 107 179 108 64 32 17 13
## 11+ <NA>
## 15 44
## [1] "Frequency table after encoding"
## b_hhsize_2016_masked. Household Size
## 2 or less 3 4 5 6 7 8 9
## 25 72 107 179 108 64 32 17
## 10 or more <NA>
## 28 44
mydata <- top_recode ("b_hhsize_2016_imp", break_point=10, missing=c(888, 999999)) # Topcode cases with 10 or more individuals.
## [1] "Frequency table before encoding"
## b_hhsize_2016_imp. Imputed Version Of B_Hhsize_2016
## 1 2 3 4 5
## 2 23 72 107 179
## 5.40981006622314 6 7 8 9
## 44 108 64 32 17
## 10 11 12 14 16
## 13 6 6 1 1
## 29
## 1
## [1] "Frequency table after encoding"
## b_hhsize_2016_imp. Imputed Version Of B_Hhsize_2016
## 1 2 3 4 5
## 2 23 72 107 179
## 5.40981006622314 6 7 8 9
## 44 108 64 32 17
## 10 or more
## 28
# Top code high income to the 99.5 percentile
# No income variables to top code
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)
indirect_PII <- c("age_at_base",
"female",
"b_v2",
"b_v3_masked",
"b_v4",
"b_v5",
"b_v7",
"b_v8",
"b_v9",
"b_v10",
"b_v11",
"b_v12",
"b_v13",
"b_v20_1",
"b_v20_2",
"b_v20_3",
"b_v20_4",
"b_v20_5",
"b_v20_6",
"b_v20_7",
"b_v20_8",
"b_v20x1_masked",
"b_v21_1_masked",
"b_v21_2_masked",
"b_v21_3_masked",
"b_v21_4_masked",
"b_v21_5_masked",
"b_v21_6_masked",
"b_v21_7_masked",
"b_v22_1",
"b_v22_2",
"b_v22_3",
"b_v22_4",
"b_v23",
"b_v24",
"b_v25_1",
"b_v25_2",
"b_v25_3",
"b_v25_4",
"b_v25_5",
"b_v25_6",
"b_v25_7",
"b_v25_8",
"b_v25_9",
"b_v25_10",
"b_v26_1_masked",
"b_v26_2_masked",
"b_v26_3_masked",
"b_v26_4_masked",
"b_v26_5_masked",
"b_v26_6_masked",
"b_v26_7_masked",
"b_v27_1",
"b_v27_2",
"b_v27_3",
"b_v27_4",
"b_v28_masked",
"b_v29_1",
"b_v29_2",
"b_v29_3",
"b_v29_4",
"b_v29_5",
"b_v29_6",
"b_v29_7",
"b_v29_8",
"b_v29_9",
"b_v29_10",
"b_v29_11",
"b_v29_13",
"b_v29_18",
"b_v29_19",
"b_v29_masked",
"b_v29_other",
"b_v30_1",
"b_v30_2",
"b_v30_3",
"b_v30_4",
"b_v30_5",
"b_v30_6",
"b_v30_7",
"b_v30_8",
"b_v30_9",
"b_v30_10",
"b_v30_11",
"b_v30_other",
"b_v31",
"b_v32",
"b_v32_other",
"b_v33",
"b_v34",
"b_v35_1",
"b_v35_2",
"b_v35_3",
"b_v35_4",
"b_v35_5",
"b_v35_6",
"b_v35_7",
"b_v35_8",
"b_v35_9",
"b_v35_10",
"b_v35_11",
"b_v35_12",
"b_v35_13",
"b_v35_14",
"b_v35x0",
"b_v36_1",
"b_v36_2",
"b_v36_3",
"b_v36_4",
"b_v36_5",
"b_v36_6",
"b_v36_7",
"b_v36_other",
"b_v37_1",
"b_v37_2",
"b_v37_3",
"b_v37_4",
"b_v37_5",
"b_v37_6",
"b_v37_7",
"b_v37_8",
"b_v37_9",
"b_v37_10",
"b_v37_11",
"b_v37_12",
"b_v37_13",
"b_v37_14",
"b_v37_15",
"b_v37_other",
"b_v39",
"b_v40",
"b_v46_masked",
"b_v47_masked",
"e_v7_a",
"e_v7_b",
"e_v7_c",
"e_v7_d",
"e_v7_e",
"e_v7_f",
"e_v7_g",
"e_v7_h",
"e_v7_other_masked",
"e_v8_1_masked",
"e_v8_2_masked",
"e_v8_3_masked",
"e_v8_4_masked",
"e_v8_5_masked",
"e_v8_6_masked",
"e_v8_7_masked",
"e_v9_a",
"e_v9_b",
"e_v9_c",
"e_v9_d",
"e_v10",
"e_v11_a",
"e_v11_b",
"e_v11_c",
"e_v11_d",
"e_v11_e",
"e_v11_f",
"e_v11_g",
"e_v11_h",
"e_v11_i",
"e_v11_j",
"e_v12_1_masked",
"e_v12_2_masked",
"e_v12_3_masked",
"e_v12_4_masked",
"e_v12_5_masked",
"e_v12_6_masked",
"e_v12_7_masked",
"e_v13_a",
"e_v13_b",
"e_v13_c",
"e_v13_d",
"e_v14_masked",
"e_v15_2",
"e_v15_3",
"e_v15_4",
"e_v15_5",
"e_v15_6",
"e_v15_7",
"e_v15_8",
"e_v15_9",
"e_v15_10",
"e_v15_11",
"e_v15_12",
"e_v15_13",
"e_v15_14",
"e_v15_15",
"e_v15_16",
"e_v15_masked",
"e_v15_other",
"e_v15_other_hazardous",
"e_v16_1",
"e_v16_2",
"e_v16_3",
"e_v16_4",
"e_v16_5",
"e_v16_6",
"e_v16_7",
"e_v16_8",
"e_v16_9",
"e_v16_10",
"e_v16_11",
"e_v16_other",
"e_v17",
"e_v18",
"e_v19",
"e_v20",
"e_v21_6meses",
"e_v21_a",
"e_v21_b",
"e_v21_c",
"e_v21_d",
"e_v21_e",
"e_v21_f",
"e_v21_g",
"e_v21_h",
"e_v21_i",
"e_v21_j",
"e_v21_k",
"e_v21_l",
"e_v21_m",
"e_v21_n",
"e_v22_a",
"e_v22_b",
"e_v22_c",
"e_v22_d",
"e_v22_e",
"e_v22_f",
"e_v22_g",
"e_v22_other",
"e_v23_a",
"e_v23_b",
"e_v23_c",
"e_v23_d",
"e_v23_e",
"e_v23_f",
"e_v23_g",
"e_v23_h",
"e_v23_i",
"e_v23_j",
"e_v23_k",
"e_v23_l",
"e_v23_m",
"e_v23_o_masked",
"e_v23_other",
"e_v25",
"e_v26",
"e_v31_masked",
"e_v32_masked",
"b_afro_2016",
"b_any_dustfumes_2016",
"b_any_harrassm_2016",
"b_any_injury_2016",
"b_casual2_2016",
"b_casual_2016",
"b_chorehrs_2016_masked",
"b_vulnerable2_2016",
"b_vulnerable_2016",
"b_fnat_2016",
"b_fspan_2016",
"b_gangcurr_2016",
"b_gangpast_2016",
"b_hcl_2016",
"b_hhchores_2016",
"b_hz_industry_2016",
"b_hz_occupation_2016",
"b_indig_2016",
"b_irregular_empl2_2016",
"b_irregular_empl_2016",
"b_long_hrs_2016",
"b_married_2016",
"b_mestizo_2016",
"b_mnat_2016",
"b_montub_2016",
"b_mspan_2016",
"b_night_work_2016",
"b_nightchores_2016",
"b_older_2016",
"b_othrace_2016",
"b_ownchild_2016",
"b_sameage_2016",
"b_temporary_noncasual2_2016",
"b_temporary_noncasual_2016",
"b_total_hrs_2016_masked",
"b_white_2016",
"b_working_2016",
"b_yearsout_2016",
"b_younger_2016",
"b_afro_2016_imp",
"b_any_dustfumes_2016_imp",
"b_any_harrassm_2016_imp",
"b_any_injury_2016_imp",
"b_chorehrs_2016_imp",
"b_fspan_2016_imp",
"b_gangcurr_2016_imp",
"b_hcl_2016_imp",
"b_hz_industry_2016_imp",
"b_hz_occupation_2016_imp",
"b_irregular_empl2_2016_imp",
"b_irregular_empl_2016_imp",
"b_long_hrs_2016_imp",
"b_married_2016_imp",
"b_mestizo_2016_imp",
"b_mspan_2016_imp",
"b_night_work_2016_imp",
"b_nightchores_2016_imp",
"b_older_2016_imp",
"b_othrace_2016_imp",
"b_ownchild_2016_imp",
"b_sameage_2016_imp",
"b_total_hrs_2016_imp",
"b_white_2016_imp",
"b_working_2016_imp",
"b_yearsout_2016_imp",
"b_younger_2016_imp",
"e_any_dustfumes_2017",
"e_any_harrassm_2017",
"e_any_injury_2017",
"e_casual_2017",
"e_chorehrs_2017_masked",
"e_gangcurr_2017",
"e_gangpast_2017",
"e_hcl_2017",
"e_hhchores_2017",
"e_hz_industry_2017",
"e_hz_occupation_2017",
"e_irregular_empl2_2017",
"e_irregular_empl_2017",
"e_long_hrs_2017",
"e_night_work_2017",
"e_nightchores_2017",
"e_shortcontract_2017",
"e_vulnerable_2017",
"e_temporary_noncasual_2017",
"e_total_hrs_2017_masked",
"e_working_2017")
capture_tables (indirect_PII)
# Recode those with very specific values.
mydata <- mydata[!names(mydata) %in% c("b_v31", "e_v17")] # Too detailed, open-ended verbatim in Spanish
# Recode races or ethnicity to reduce risk of re-identification
break_edu <- c(1,2,3,88,99)
labels_edu <- c("1. White" =1,
"2. Mestizo" = 2,
"3. Other" =3)
mydata <- ordinal_recode (variable="b_v9", break_points=break_edu, missing=8888, value_labels=labels_edu)
## [1] "Frequency table before encoding"
## b_v9. What Is Your Race/Ethnicity?
## White Mestizo Afro, black, or mulatto
## 44 536 30
## Montubio Indigenous 88
## 7 18 2
## 99 <NA>
## 1 38
## recoded
## [1,2) [2,3) [3,88) [88,99) [99,8.89e+03)
## 1 44 0 0 0 0
## 2 0 536 0 0 0
## 3 0 0 30 0 0
## 4 0 0 7 0 0
## 5 0 0 18 0 0
## 88 0 0 0 2 0
## 99 0 0 0 0 1
## [1] "Frequency table after encoding"
## b_v9. What Is Your Race/Ethnicity?
## 1. White 2. Mestizo 3. Other 4 5 <NA>
## 44 536 55 2 1 38
## [1] "Inspect value labels and relabel as necessary"
## 1. White 2. Mestizo 3. Other
## 1 2 3
#Drop variables that shows the same information as b_v9.
dropvars <- c("b_afro_2016","b_indig_2016","b_montub_2016","b_othrace_2016", "b_afro_2016_imp","b_othrace_2016_imp")
mydata <- mydata[!names(mydata) %in% dropvars]
#Recode mother and father's native language to reduce risk of re-identification
break_edu <- c(1,2,4,5)
labels_edu <- c("1. Spanish" =1,
"2. Other" = 2,
"3. Not sure"=3,
"4. No response"=4)
mydata <- ordinal_recode (variable="b_v7", break_points=break_edu, missing=8888, value_labels=labels_edu)
## [1] "Frequency table before encoding"
## b_v7. What Was Your Mother'S Mother Tongue?
## Spanish Native language Another language Not sure No response
## 569 25 2 40 2
## <NA>
## 38
## recoded
## [1,2) [2,4) [4,5) [5,8.89e+03)
## 1 569 0 0 0
## 2 0 25 0 0
## 3 0 2 0 0
## 4 0 0 40 0
## 99 0 0 0 2
## [1] "Frequency table after encoding"
## b_v7. What Was Your Mother'S Mother Tongue?
## 1. Spanish 2. Other 3. Not sure 4. No response <NA>
## 569 27 40 2 38
## [1] "Inspect value labels and relabel as necessary"
## 1. Spanish 2. Other 3. Not sure 4. No response
## 1 2 3 4
mydata <- ordinal_recode (variable="b_v8", break_points=break_edu, missing=8888, value_labels=labels_edu)
## [1] "Frequency table before encoding"
## b_v8. What Was Your Father'S Mother Tongue?
## Spanish Native language Another language Not sure No response
## 539 23 3 71 2
## <NA>
## 38
## recoded
## [1,2) [2,4) [4,5) [5,8.89e+03)
## 1 539 0 0 0
## 2 0 23 0 0
## 3 0 3 0 0
## 4 0 0 71 0
## 99 0 0 0 2
## [1] "Frequency table after encoding"
## b_v8. What Was Your Father'S Mother Tongue?
## 1. Spanish 2. Other 3. Not sure 4. No response <NA>
## 539 26 71 2 38
## [1] "Inspect value labels and relabel as necessary"
## 1. Spanish 2. Other 3. Not sure 4. No response
## 1 2 3 4
# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
selectedKeyVars = c('age_at_base', 'female', 'b_v9') ##!!! Replace with candidate categorical demo vars
# weight variable (add if available)
# selectedWeightVar = c('projwt') ##!!! Replace with weight var
# household id variable (cluster)
# selectedHouseholdID = c('wpid') ##!!! Replace with household id
# creating the sdcMicro object with the assigned variables
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial
## The input dataset consists of 676 rows and 558 variables.
## --> Categorical key variables: age_at_base, female, b_v9
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## age_at_base 4 (4) 169.000 (169.000) 40 (40)
## female 2 (2) 338.000 (338.000) 260 (260)
## b_v9 6 (6) 127.600 (127.600) 1 (1)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 0 (0.000%)
## - 3-anonymity: 0 (0.000%)
## - 5-anonymity: 2 (0.296%)
##
## ----------------------------------------------------------------------
# !!! Identify open-end variables here:
!!! No Open-ends
# !!!No GPS data
Adds "_PU" (Public Use) to the end of the name
haven::write_dta(mydata, paste0(filename, "_PU.dta"))
haven::write_sav(mydata, paste0(filename, "_PU.sav"))
# Add report title dynamically
title_var <- paste0("DOL-ILAB SDC - ", filename)