# Start from a clean workspace. `all.names = TRUE` is spelled out in full so
# that hidden (dot-prefixed) objects are removed as well; lowercase `t` is the
# base transpose function, not a logical value.
rm(list = ls(all.names = TRUE))
# Base name of the dataset being processed; reused to build output file names
filename <- "bhsection3" # !!!Update filename
# Project helper script that defines the functions called in this script
functions_vers <- "functions_1.7.R" # !!!Update helper functions file
source(functions_vers)
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
# !!! No Direct PII
# !!! No Direct PII-team
# !!!Include relevant variables, but check their population size first to confirm they are <100,000
# Columns to remove from the dataset before release
dropvars <- c("dise")
keep_cols <- !(names(mydata) %in% dropvars)
mydata <- mydata[keep_cols]
# Location identifiers to be re-coded. The frequency tables printed below show
# the original codes replaced by new codes with the same counts.
# NOTE(review): `missing = 999999` is presumably the missing-value code the
# helper should recognise - confirm against the helper functions file.
locvars <- c("q006_block_id", "q007_vlg_id")
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## q006_block_id. 6 Block Code
## 1 2 3 4 5 6 7 8 9 <NA>
## 196 159 201 413 99 200 147 426 517 33
## [1] "Frequency table after encoding"
## q006_block_id. 6 Block Code
## 279 280 281 282 283 284 285 286 287 <NA>
## 426 159 201 517 413 99 196 147 200 33
## [1] "Frequency table before encoding"
## q007_vlg_id. 7 Village Code
## 1 2 3 4 5 6 7 9 10 11 12 13 15 16 17 18 19 20
## 20 16 16 16 21 31 28 17 15 20 24 24 15 18 24 17 17 18
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
## 30 23 18 17 33 27 26 18 14 15 24 24 22 16 31 18 17 22
## 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
## 28 17 16 18 17 29 20 24 21 19 18 17 16 18 26 28 27 18
## 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
## 18 23 13 24 22 16 18 18 30 16 18 21 25 13 16 20 16 23
## 75 76 77 78 80 81 82 83 84 85 87 88 89 90 91 92 93 94
## 24 17 22 29 30 17 22 17 17 13 16 22 15 21 19 19 21 13
## 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
## 18 22 28 21 25 18 24 22 15 19 19 31 16 27 21 20 21 26
## 113 114 115 116 117 118 119 <NA>
## 14 24 19 16 21 22 16 33
## [1] "Frequency table after encoding"
## q007_vlg_id. 7 Village Code
## 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282
## 18 25 25 17 22 18 16 20 17 23 16 20 18 21 24 29 17 15
## 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300
## 17 22 22 18 28 18 31 16 20 16 18 17 21 16 20 16 23 19
## 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318
## 22 15 19 16 17 21 20 21 28 27 14 22 17 17 24 27 29 18
## 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
## 18 16 28 24 22 17 30 13 15 18 30 21 15 16 26 23 24 18
## 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354
## 18 19 18 17 16 26 21 13 30 22 28 17 19 22 15 14 13 13
## 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372
## 18 24 16 33 24 21 31 16 24 17 18 19 26 27 18 16 31 16
## 373 374 375 376 377 378 379 <NA>
## 24 21 22 24 19 24 21 33
# Concentrate on variables whose "Lowest Freq" in the dictionary is 30 or less.
# Tabulate member age to see how sparse the individual values are.
table(mydata[["q302_age_mem"]])
##
## 3 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 25 26 35 42 47 50
## 1 5 8 5 8 3 10 7 18 19 21 34 27 20 20 2 4 2 6 3 1 1 1 1 1
# Top-code member age at 18: as the tables below show, ages 18 and above are
# collapsed into a single "18 or more" category.
mydata <- top_recode(
  variable = "q302_age_mem",
  break_point = 18,
  missing = NA
)
## [1] "Frequency table before encoding"
## q302_age_mem. 302 How old is ?
## 3 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
## 1 5 8 5 8 3 10 7 18 19 21 34 27 20 20 2 4 2
## 22 25 26 35 42 47 50 <NA>
## 6 3 1 1 1 1 1 2163
## [1] "Frequency table after encoding"
## q302_age_mem. 302 How old is ?
## 3 5 6 7 8 9 10 11
## 1 5 8 5 8 3 10 7
## 12 13 14 15 16 17 18 or more <NA>
## 18 19 21 34 27 20 42 2163
# !!!List the relevant variables below (Indirect PII - Categorical, plus Ordinal ones not processed yet)
indirect_PII <- c(
  "q308_location",
  "q309_travel"
)
# Hand the listed variables to the project helper for tabulation
capture_tables(indirect_PII)
# Choose the variables used to build the sdcMicro object, based on the
# dictionary inspection.
# Reference: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# Every name below must match a column name in the data file.
# Categorical key variables (candidates: gender, occupation/education, age)
selectedKeyVars <- c("q302_age_mem", "q303_gender") ##!!! Replace with candidate categorical demo vars
# Weight variable (enable if one is available)
# selectedWeightVar <- c("projwt") ##!!! Replace with weight var
# Household id variable (cluster)
selectedHouseholdID <- c("hh_id") ##!!! Replace with household id
# Build the sdcMicro object from the selections above, then print its summary
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars,
  hhId = selectedHouseholdID
)
sdcInitial
## The input dataset consists of 2391 rows and 27 variables.
## --> Categorical key variables: q302_age_mem, q303_gender
## --> Cluster/Household-Id variable: hh_id
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## q302_age_mem 16 (16) 15.200 (15.200) 1 (1)
## q303_gender 3 (3) 114.000 (114.000) 96 (96)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 3 (0.125%)
## - 3-anonymity: 9 (0.376%)
## - 5-anonymity: 30 (1.255%)
##
## ----------------------------------------------------------------------
# Show values of key variables of records that violate k-anonymity
#mydata <- labelDataset(mydata)
# Logical mask of records failing 2-anonymity: second column of the individual
# risk table is below 2.
# NOTE(review): column 2 is presumably the sample frequency count of the
# key-variable combination - confirm against the sdcMicro documentation.
individual_risk <- sdcInitial@risk$individual
notAnon <- individual_risk[, 2] < 2 # for 2-anonymity
# Print the key-variable values of the flagged records
mydata[notAnon, selectedKeyVars]
## # A tibble: 3 x 2
## q302_age_mem q303_gender
## <dbl+lbl> <dbl+lbl>
## 1 9 1 [Female]
## 2 3 0 [Male]
## 3 5 1 [Female]
# Three cases do not meet 2-anonymity by gender and age, but these variables are critical for analysis, so no further suppression is applied.
# Free-text ("other, specify") variables to review
open_ends <- c(
  "q308_location_othr",
  "q309_travel_othr"
)
# Pass the open-ended variables to the project helper for reporting
report_open(list_open_ends = open_ends)
# Drop as actually verbatim data in local language
is_open_end <- names(mydata) %in% open_ends
mydata <- mydata[!is_open_end]
# !!! No GPS data
# Adds "_PU" (Public Use) to the end of the name
# Write the processed dataset in Stata (.dta) and SPSS (.sav) formats
dta_path <- paste0(filename, "_PU.dta")
sav_path <- paste0(filename, "_PU.sav")
haven::write_dta(mydata, dta_path)
haven::write_sav(mydata, sav_path)
# Build the report title from the dataset name
title_var <- paste0("DOL-ILAB SDC - ", filename)