# Start from a clean workspace.
# The original call was `ls(all=t)`: `t` is the transpose function, not the
# logical TRUE, and `all` relied on partial matching of `all.names`. Spell both
# out so that hidden (dot-prefixed) objects are cleared as intended.
rm(list = ls(all.names = TRUE))

# Base name of the dataset being processed; reused for the output file names
# and the report title further down.
filename <- "bhsection1_relabelled" # !!!Update filename

# Script providing the helper functions called below (encode_location,
# top_recode, ordinal_recode, capture_tables, report_open).
# NOTE(review): `mydata` is never created in this file, so this script
# presumably also loads it from `filename` - confirm.
functions_vers <- "functions_1.7.R" # !!!Update helper functions file
source(functions_vers)
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
# !!! No Direct PII
# !!! No Direct PII-team
# !!! Include relevant variables, but check their population size first to confirm they are <100,000
# Columns removed outright before any further processing.
dropvars <- "dise"
mydata <- mydata[!is.element(names(mydata), dropvars)]

# Small-location identifiers (block and village codes) are replaced by encoded
# values. The helper prints a frequency table before and after encoding.
# NOTE(review): `missing = 999999` is presumably the code treated as missing by
# the helper - confirm against the helper functions file.
locvars <- c("q006_block_id", "q007_vlg_id")
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## q006_block_id. 6 Block Code
## 1 2 3 4 5 6 7 8 9 <NA>
## 194 155 195 407 98 190 143 422 516 33
## [1] "Frequency table after encoding"
## q006_block_id. 6 Block Code
## 279 280 281 282 283 284 285 286 287 <NA>
## 422 155 195 516 407 98 194 143 190 33
## [1] "Frequency table before encoding"
## q007_vlg_id. 7 Village Code
## 1 2 3 4 5 6 7 9 10 11 12 13 15 16 17 18 19
## 16 16 16 15 20 31 28 17 15 20 24 24 15 18 21 17 17
## 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 18 30 22 18 17 32 27 26 18 14 15 24 24 22 16 29 18
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
## 17 22 27 17 16 18 17 28 20 24 21 19 17 17 16 18 26
## 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
## 24 27 18 17 21 13 24 22 16 18 18 29 16 18 21 25 13
## 71 72 73 74 75 76 77 78 80 81 82 83 84 85 87 88 89
## 16 19 16 23 23 17 22 29 30 17 22 17 17 13 16 22 15
## 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
## 19 19 19 21 13 17 22 28 21 25 18 24 21 15 19 14 31
## 107 108 109 110 111 112 113 114 115 116 117 118 119 <NA>
## 16 27 21 17 21 26 14 24 19 16 21 22 16 33
## [1] "Frequency table after encoding"
## q007_vlg_id. 7 Village Code
## 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281
## 18 25 25 17 22 18 16 20 17 21 16 16 18 20 24 28 17
## 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298
## 15 17 22 22 18 24 17 31 16 19 16 18 17 21 15 20 16
## 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315
## 23 19 22 15 14 16 17 19 17 21 27 27 14 22 17 17 24
## 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332
## 27 29 18 18 16 28 24 22 17 29 13 15 17 30 21 15 16
## 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349
## 26 22 24 18 18 19 18 17 16 26 21 13 30 22 28 17 19
## 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366
## 21 15 14 13 13 18 21 16 32 24 21 29 16 24 17 18 19
## 367 368 369 370 371 372 373 374 375 376 377 378 379 <NA>
## 26 27 17 16 31 16 23 21 22 24 19 24 21 33
# Top-code sparse numeric variables: focus on those whose "Lowest Freq" in the
# dictionary is 30 or less.
# Land owned (in Bighas) is capped at 50; the helper prints frequency tables
# before and after the recode.
# NOTE(review): in the recorded output the "50 or more" bucket holds 510
# records, which matches the 31 values >= 50 plus the "Don't Know" (304) and
# "Collectively Owned" (175) rows. Confirm whether those special codes should
# be handled through `missing` rather than being folded into the top category.
mydata <- top_recode(
  variable = "q106_own_land_bighas",
  break_point = 50,
  missing = NA
)
## [1] "Frequency table before encoding"
## q106_own_land_bighas. 106 Land owned as of day of survey (in Bighas)
## 0 1
## 417 243
## 2 3
## 278 154
## 4 5
## 104 170
## 6 7
## 49 58
## 8 9
## 54 14
## 10 11
## 102 6
## 12 13
## 35 4
## 14 15
## 2 36
## 16 17
## 5 2
## 20 21
## 36 1
## 22 23
## 3 1
## 25 28
## 15 1
## 30 31
## 27 1
## 35 38
## 6 1
## 40 43
## 14 1
## 45 47
## 2 1
## 50 54
## 9 1
## 60 65
## 6 1
## 70 80
## 2 3
## Don't Know Don't know Collectively Owned
## 304 175
## 100 102
## 2 2
## 111 150
## 1 2
## 300 400
## 1 1
## [1] "Frequency table after encoding"
## q106_own_land_bighas. 106 Land owned as of day of survey (in Bighas)
## 0 1 2 3 4 5 6
## 417 243 278 154 104 170 49
## 7 8 9 10 11 12 13
## 58 54 14 102 6 35 4
## 14 15 16 17 20 21 22
## 2 36 5 2 36 1 3
## 23 25 28 30 31 35 38
## 1 15 1 27 1 6 1
## 40 43 45 47 50 or more
## 14 1 2 1 510
# Cultivated land (in Bighas) is capped at 30; the helper prints frequency
# tables before and after the recode.
# NOTE(review): in the recorded output the "30 or more" bucket holds 794
# records, which matches the 25 values >= 30 plus the "Don't Know" (569) and
# "Collectively Owned" (200) rows. Confirm whether those special codes should
# be handled through `missing` rather than being folded into the top category.
mydata <- top_recode(
  variable = "q107_culti_bighas",
  break_point = 30,
  missing = NA
)
## [1] "Frequency table before encoding"
## q107_culti_bighas. 106 Land cultiavted including orchard and planation as of day of survey (in Bigh
## 0 1
## 1015 183
## 2 3
## 115 56
## 4 5
## 43 43
## 6 7
## 19 18
## 8 9
## 10 5
## 10 11
## 22 2
## 12 14
## 7 2
## 15 20
## 7 7
## 22 23
## 2 1
## 25 27
## 1 1
## 30 40
## 11 5
## 54 60
## 1 1
## 65 80
## 1 1
## Don't Know Don't know Collectively Owned
## 569 200
## 100 150
## 3 1
## 400
## 1
## [1] "Frequency table after encoding"
## q107_culti_bighas. 106 Land cultiavted including orchard and planation as of day of survey (in Bigh
## 0 1 2 3 4 5 6
## 1015 183 115 56 43 43 19
## 7 8 9 10 11 12 14
## 18 10 5 22 2 7 2
## 15 20 22 23 25 27 30 or more
## 7 7 2 1 1 1 794
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)
# Variables flagged as indirect identifiers; their tables are captured for
# review before deciding on recodes.
indirect_PII <- c(
  # Demographics and livelihood
  "q101_religion",
  "q102_language",
  "q104_employment",
  "q105_industry_id",
  # q111 / q112 items
  "q111c_aff_flood",
  "q112c_res_flood",
  "q111f_aff_plague",
  "q112f_res_plague",
  "q112g_res_layoff",
  "q111j_othr_specify",
  "q111j_aff_othr",
  "q112j_res_othr",
  # q113 / q114 items
  "q114a_loss_emp",
  "q114b_low_income",
  "q114c_family_busines",
  "q113g_fire",
  "q114g_fire",
  "q114h_robbery",
  "q114k_loss_cash",
  "q114l_fall_price",
  "q114m_harvest",
  "q113o_hh_prob",
  "q114o_hh_prob"
)
capture_tables(indirect_PII)
# Recode those with very specific values.
# Home language is collapsed into three groups. Per the recorded cross-table,
# the break points give the intervals [1,2), [2,14) and [14,1e+06): code 1
# stays "Hindi", codes 2-13 become "Other", and code 14 is "Marwadi".
break_lan <- c(1, 2, 14)
labels_lan <- c(Hindi = 1, Other = 2, Marwadi = 3)
mydata <- ordinal_recode(
  variable = "q102_language",
  break_points = break_lan,
  missing = 999999,
  value_labels = labels_lan
)
## [1] "Frequency table before encoding"
## q102_language. 102 What language do you normally speak at home?
## Hindi Assamese Bangla Sindhi Punjabi Urdu Marwadi
## 207 1 1 1 2 3 2138
## recoded
## [1,2) [2,14) [14,1e+06)
## 1 207 0 0
## 2 0 1 0
## 3 0 1 0
## 4 0 1 0
## 12 0 2 0
## 13 0 3 0
## 14 0 0 2138
## [1] "Frequency table after encoding"
## q102_language. 102 What language do you normally speak at home?
## Hindi Other Marwadi
## 207 8 2138
## [1] "Inspect value labels and relabel as necessary"
## Hindi Other Marwadi
## 1 2 3
# Religion keeps its three codes; per the recorded output the third category
# (2 records, previously labelled "Christianity") is relabelled "Other".
break_rel <- c(1, 2, 3)
labels_rel <- c(Hinduism = 1, Islam = 2, Other = 3)
mydata <- ordinal_recode(
  variable = "q101_religion",
  break_points = break_rel,
  missing = 999999,
  value_labels = labels_rel
)
## [1] "Frequency table before encoding"
## q101_religion. 101 What is your religion?
## Hinduism Islam Christianity
## 1845 506 2
## recoded
## [1,2) [2,3) [3,1e+06)
## 1 1845 0 0
## 2 0 506 0
## 3 0 0 2
## [1] "Frequency table after encoding"
## q101_religion. 101 What is your religion?
## Hinduism Islam Other
## 1845 506 2
## [1] "Inspect value labels and relabel as necessary"
## Hinduism Islam Other
## 1 2 3
# Drop as strong identifier
mydata <- mydata[!is.element(names(mydata), "q105_industry_id")]

# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file

# Selected categorical key variables: religion, social group and employment
selectedKeyVars <- c(
  "q101_religion",
  "q103_social_grp",
  "q104_employment"
) ##!!! Replace with candidate categorical demo vars

# weight variable (add if available)
# selectedWeightVar = c('projwt') ##!!! Replace with weight var

# household id variable (cluster)
selectedHouseholdID <- "hh_id" ##!!! Replace with household id

# Build the sdcMicro object from the chosen key variables and household id,
# then show its summary (category sizes and k-anonymity violations).
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars,
  hhId = selectedHouseholdID
)
sdcInitial
## The input dataset consists of 2353 rows and 73 variables.
## --> Categorical key variables: q101_religion, q103_social_grp, q104_employment
## --> Cluster/Household-Id variable: hh_id
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## q101_religion 3 (3) 784.333 (784.333) 2
## q103_social_grp 5 (5) 584.250 (584.250) 145
## q104_employment 7 (7) 336.143 (336.143) 3
##
## (2)
## (145)
## (3)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 7 (0.297%)
## - 3-anonymity: 17 (0.722%)
## - 5-anonymity: 24 (1.020%)
##
## ----------------------------------------------------------------------
# Show values of key variables for records that violate k-anonymity
# mydata <- labelDataset(mydata)
# Logical mask of records violating 2-anonymity: TRUE where the value in
# column 2 of the individual-risk matrix is below 2.
# NOTE(review): column 2 is presumably the sample frequency count of the
# key-variable combination - confirm against the sdcMicro documentation.
notAnon <- sdcInitial@risk[["individual"]][, 2] < 2
# List the key-variable values of the flagged records.
mydata[notAnon, selectedKeyVars]
## # A tibble: 7 x 3
## q101_religion q103_social_grp q104_employment
## <dbl+lbl> <dbl+lbl> <dbl+lbl>
## 1 2 [Islam] 3 [Other Backward Class/Caste] 6 [Animal husbandary]
## 2 2 [Islam] 3 [Other Backward Class/Caste] 99 [Unclear information]
## 3 1 [Hinduism] 2 [Scheduled Caste] 6 [Animal husbandary]
## 4 3 [Other] 4 [General Class/Caste] 3 [Regular wage / salary earning]
## 5 2 [Islam] 1 [Scheduled Tribe] 2 [Self employed nonagricultural]
## 6 2 [Islam] 2 [Scheduled Caste] 4 [Casual labor in agriculture]
## 7 3 [Other] 4 [General Class/Caste] 5 [Casual labor in non-agriculture]
# Run sdcMicro's local suppression on the initial object.
sdcFinal <- localSuppression(sdcInitial)

# Recombining anonymized variables
# Preview the manipulated key variables for the records flagged in `notAnon`,
# to see which cells the algorithm suppressed (shown as NA).
extractManipData(sdcFinal)[notAnon, selectedKeyVars] # manipulated variables HH
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## q101_religion q103_social_grp q104_employment
## 435 2 3 NA
## 478 2 3 NA
## 772 1 2 NA
## 853 3 4 NA
## 1255 2 1 NA
## 1283 2 2 NA
## 2274 NA 4 5
# Apply the suppression by hand: blank the employment variable for every
# record flagged in `notAnon`.
# NOTE(review): the suppression preview recorded above blanked q101_religion
# (not q104_employment) for row 2274; here employment is blanked for all
# flagged rows instead. The re-check below reports no 2-anonymity violations,
# but confirm that this deviation is intended.
mydata[notAnon, "q104_employment"] <- NA

# Re-check the risk summary on the modified data.
createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars,
  hhId = selectedHouseholdID
)
## The input dataset consists of 2353 rows and 73 variables.
## --> Categorical key variables: q101_religion, q103_social_grp, q104_employment
## --> Cluster/Household-Id variable: hh_id
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## q101_religion 3 (3) 784.333 (784.333) 2
## q103_social_grp 5 (5) 584.250 (584.250) 145
## q104_employment 8 (8) 335.143 (335.143) 2
##
## (2)
## (145)
## (2)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 0 (0.000%)
## - 3-anonymity: 8 (0.340%)
## - 5-anonymity: 17 (0.722%)
##
## ----------------------------------------------------------------------
# !!! Identify open-end variables here:
open_ends <- c("q104_employment_othr", "q113o_other_prob_specify")
report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number
# Drop as actually verbatim data in local language
mydata <- mydata[
  !is.element(
    names(mydata),
    c("q104_employment_othr", "q113o_other_prob_specify")
  )
]
# !!! No GPS data
# Adds "_PU" (Public Use) to the end of the file name
# Write the processed data in Stata (.dta) and SPSS (.sav) formats, using the
# base file name with a "_PU" suffix.
haven::write_dta(data = mydata, path = paste0(filename, "_PU.dta"))
haven::write_sav(data = mydata, path = paste0(filename, "_PU.sav"))

# Add report title dynamically
title_var <- paste0("DOL-ILAB SDC - ", filename)