rm(list=ls(all=t))
filename <- "Round3_demo" # !!!Update filename
source ("functions_1.5.R")
Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
#!!!Save flagged dictionary in .xlsx format, add "DatasetReview" to name and continue processing data with subset of flagged variables
# !!!No direct PII variables
!!!Replace vector in "variables" field below with relevant variable names
# Encode Direct PII-team
mydata <- encode_direct_PII_team (variables=c("Srvyr"))
## [1] "Frequency table before encoding"
## Srvyr. Srvyr
## alka.adhikari ambir.raj.kulung amrita.roka anjana.kumari.dulal ashish.shrestha
## 79 96 89 98 82
## bhanu.bhakta.dhakal dev.raj.nepal dhan.kumari.darlami gita.maharjan kamala.sharma
## 77 2 85 99 78
## manjula.giri min.kumari.shrestha nabina.khadka niraj.shrestha pramila.shrestha
## 99 86 80 85 77
## pratika.shrestha rabischandra.bhatta ram.kumar.acharya sajina.shrestha sandip.shrestha
## 85 88 88 73 96
## sapana.gautam sarita.shrestha tirtha.maya.rai yamuna.karki
## 80 99 104 86
## [1] "Frequency table after encoding"
## Srvyr. Srvyr
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## 79 96 89 98 82 77 2 85 99 78 99 86 80 85 77 85 88 88 73 96 80 99 104 86
!!!Include relevant variables, but check their population size first to confirm they are <100,000
locvars <- c("ID_4", "VDC")
mydata <- encode_location (variables= locvars, missing=999999)
## [1] "Frequency table before encoding"
## ID_4. Ward Number
## 1 2 3 4 5 6 7 8 9 10 11 12 14
## 263 246 169 237 162 290 209 155 218 11 19 22 10
## [1] "Frequency table after encoding"
## ID_4. Ward Number
## 980 981 982 983 984 985 986 987 988 989 990 991 992
## 263 246 169 237 162 290 209 155 218 11 19 22 10
## [1] "Frequency table before encoding"
## VDC. VDC code
## Barahathawa Dhungrekhola Lalbandi Malangawa NP Netraganj
## 38 29 61 27 34
## Raniganj Sankarpur Bhimeswor NP Bocha Dandakharka
## 37 28 32 17 32
## Fasku Katakuti Lamidanda Melung Pawati
## 31 31 29 34 27
## Badegau Irkhu BhoteNamlang Talamarang Ichok
## 24 33 32 28 32
## Kadambas Langarche Melamchi Anaikot Baluwapati Deupur
## 25 32 34 25 21
## Chalal Ganeshthan Kalati Bhumidanda Mahankal Chaur Methinkot Patalekhet
## 41 41 26 35 25
## Raviopi Balkot Changunarayan Chitapol Duwakot
## 42 22 35 34 27
## Gundu Madhyapur Thimi NP Nankhel Sirutar Baireni
## 33 29 33 36 36
## Dhussa Khari Kiranchok Naubise Salyantar
## 31 32 34 27 35
## Sunaula Bazar Thakre Chitlang Churiyamai Fakhel
## 33 24 27 28 30
## Padampokhari Kulekhani Nibuwatar Shreepur Chhatiwan Sisneri Mahadevsthan
## 39 37 32 37 29
## Birendranagar Jutpani Kathar Khairahani Padampur
## 18 39 37 32 38
## Parbatipur Piple Shaktikhor
## 31 37 36
## [1] "Frequency table after encoding"
## VDC. VDC code
## 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308
## 38 29 29 32 27 34 37 28 32 17 32 31 31 29 34 27 24 33 32 28 32 25 32 34 25 21 41 41
## 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
## 26 35 25 42 22 35 34 27 33 29 33 36 36 31 32 34 27 35 33 24 27 28 30 39 37 32 37 29
## 337 338 339 340 341 342 343 344
## 18 39 37 32 38 31 37 36
# Focus on variables with a "Lowest Freq" of 10 or less.
haven_table("D_2")
## D_2. How old are you?
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
## 18 44 60 62 63 72 68 72 78 60 49 41 35 53 59 50 56 54 53 52 36 35 37 51 34 42 28 41 44 31 36 34 37 34 32 28 31 29
## 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
## 28 23 27 19 25 32 15 18 17 18 13 13 9 12 2 1
break_age <- c(15,25,35,45,55,65,100)
labels_age <- c("15-24" =1,
"25-34" =2,
"35-44" =3,
"45-54" =4,
"55-64" =5,
"65 and older" =6,
"NA" = 7)
mydata <- ordinal_recode (variable="D_2", break_points=break_age, missing=999999, value_labels=labels_age)
## recoded
## [15,25) [25,35) [35,45) [45,55) [55,65) [65,100) [100,1e+06)
## 16 18 0 0 0 0 0 0
## 17 44 0 0 0 0 0 0
## 18 60 0 0 0 0 0 0
## 19 62 0 0 0 0 0 0
## 20 63 0 0 0 0 0 0
## 21 72 0 0 0 0 0 0
## 22 68 0 0 0 0 0 0
## 23 72 0 0 0 0 0 0
## 24 78 0 0 0 0 0 0
## 25 0 60 0 0 0 0 0
## 26 0 49 0 0 0 0 0
## 27 0 41 0 0 0 0 0
## 28 0 35 0 0 0 0 0
## 29 0 53 0 0 0 0 0
## 30 0 59 0 0 0 0 0
## 31 0 50 0 0 0 0 0
## 32 0 56 0 0 0 0 0
## 33 0 54 0 0 0 0 0
## 34 0 53 0 0 0 0 0
## 35 0 0 52 0 0 0 0
## 36 0 0 36 0 0 0 0
## 37 0 0 35 0 0 0 0
## 38 0 0 37 0 0 0 0
## 39 0 0 51 0 0 0 0
## 40 0 0 34 0 0 0 0
## 41 0 0 42 0 0 0 0
## 42 0 0 28 0 0 0 0
## 43 0 0 41 0 0 0 0
## 44 0 0 44 0 0 0 0
## 45 0 0 0 31 0 0 0
## 46 0 0 0 36 0 0 0
## 47 0 0 0 34 0 0 0
## 48 0 0 0 37 0 0 0
## 49 0 0 0 34 0 0 0
## 50 0 0 0 32 0 0 0
## 51 0 0 0 28 0 0 0
## 52 0 0 0 31 0 0 0
## 53 0 0 0 29 0 0 0
## 54 0 0 0 28 0 0 0
## 55 0 0 0 0 23 0 0
## 56 0 0 0 0 27 0 0
## 57 0 0 0 0 19 0 0
## 58 0 0 0 0 25 0 0
## 59 0 0 0 0 32 0 0
## 60 0 0 0 0 15 0 0
## 61 0 0 0 0 18 0 0
## 62 0 0 0 0 17 0 0
## 63 0 0 0 0 18 0 0
## 64 0 0 0 0 13 0 0
## 65 0 0 0 0 0 13 0
## 66 0 0 0 0 0 9 0
## 67 0 0 0 0 0 12 0
## 68 0 0 0 0 0 2 0
## 69 0 0 0 0 0 1 0
## D_2. How old are you?
## 15-24 25-34 35-44 45-54 55-64 65 and older
## 537 510 400 320 207 37
## [1] "Inspect value labels and relabel as necessary"
## 15-24 25-34 35-44 45-54 55-64 65 and older NA
## 1 2 3 4 5 6 7
break_edu <- c(0,6,9,11,12,13,16,18,19, 777, 888, 999)
labels_edu <- c("Primary or less (0-5)" = 1,
"Lower secondary (6-8)" = 2,
"Secondary (9-10)" = 3,
"SLC (11)" = 4,
"CLASS 12/Intermediate level (12)" = 5,
"Bachelor/Postgraduate level" = 6,
"Literate, but never attended school" = 7,
"Illiterate, and never attended school"= 8,
"Refused"= 9,
"Does not apply" = 10,
"Don't Know" = 11)
mydata <- ordinal_recode (variable="D_4", break_points=break_edu, missing=999, value_labels=labels_edu)
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,16) [16,18) [18,19) [19,777) [777,888) [888,999) [999,1e+03)
## 0 1 0 0 0 0 0 0 0 0 0 0 0
## 1 31 0 0 0 0 0 0 0 0 0 0 0
## 2 54 0 0 0 0 0 0 0 0 0 0 0
## 3 71 0 0 0 0 0 0 0 0 0 0 0
## 4 73 0 0 0 0 0 0 0 0 0 0 0
## 5 149 0 0 0 0 0 0 0 0 0 0 0
## 6 0 69 0 0 0 0 0 0 0 0 0 0
## 7 0 85 0 0 0 0 0 0 0 0 0 0
## 8 0 120 0 0 0 0 0 0 0 0 0 0
## 9 0 0 84 0 0 0 0 0 0 0 0 0
## 10 0 0 102 0 0 0 0 0 0 0 0 0
## 11 0 0 0 296 0 0 0 0 0 0 0 0
## 12 0 0 0 0 264 0 0 0 0 0 0 0
## 13 0 0 0 0 0 62 0 0 0 0 0 0
## 14 0 0 0 0 0 9 0 0 0 0 0 0
## 17 0 0 0 0 0 0 304 0 0 0 0 0
## 18 0 0 0 0 0 0 0 237 0 0 0 0
## D_4. What is your highest completed education level? [You do not need to read the re
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 379 274 186
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 296 264 71
## Literate, but never attended school Illiterate, and never attended school
## 304 237
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 1 2 3
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 4 5 6
## Literate, but never attended school Illiterate, and never attended school Refused
## 7 8 9
## Does not apply Don't Know
## 10 11
# Top code household composition variables with large and unusual numbers
mydata <- top_recode ("D_20", break_point=5, missing=c(888, 999999)) # Topcode cases with 5 or more adult household members.
## [1] "Frequency table after encoding"
## D_20. How many LIVING sons and daughters do you have?
## 0 1 2 3 4 5 6 7 8 9 10 11 888
## 375 243 523 375 188 113 70 25 6 1 1 1 90
## [1] "Frequency table after encoding"
## D_20. How many LIVING sons and daughters do you have?
## 0 1 2 3 4 5 or more 888
## 375 243 523 375 188 217 90
# Top code high income to the 99.5 percentile
percentile_99.5 <- floor(quantile(mydata$Inc_17[mydata$Inc_17!=999999], probs = c(0.995)))
mydata <- top_recode (variable="Inc_17", break_point=percentile_99.5, missing=999999)
## [1] "Frequency table after encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS).
## 0 5 50 60 400 500 600 700 777 888 999 1000 1200 1500 1600 2000
## 132 1 1 1 1 5 2 1 4 1 22 3 1 4 5 26
## 2400 2500 2600 3000 3500 4000 4500 4800 5000 6000 6500 7000 8000 8500 9000 10000
## 2 8 1 33 2 42 3 1 99 34 1 38 20 1 24 193
## 11000 12000 12846 13000 14000 14500 15000 15500 16000 17000 18000 19000 19135 20000 21000 22000
## 3 54 1 11 10 1 174 1 23 8 15 6 1 220 2 12
## 22500 23000 24000 25000 26000 27000 27500 28000 30000 32000 33000 34000 35000 36000 37000 40000
## 1 4 5 105 1 5 1 2 177 2 1 1 54 4 1 88
## 41000 45000 48000 50000 54000 55000 57000 60000 62000 65000 66000 67000 68000 70000 75000 79000
## 3 17 1 113 2 4 2 48 1 7 1 1 1 15 4 1
## 79500 80000 85000 95000 1e+05 103000 104000 110000 115000 117000 125000 130000 135000 150000 160000 170000
## 1 13 1 1 27 1 1 1 2 1 1 1 1 11 1 1
## 2e+05 240000 250000 3e+05 320000 5e+05 6e+05 7e+05 1e+06
## 9 1 1 3 1 1 1 1 1
## [1] "Frequency table after encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS).
## 0 5 50 60 400 500 600 700
## 132 1 1 1 1 5 2 1
## 777 888 999 1000 1200 1500 1600 2000
## 4 1 22 3 1 4 5 26
## 2400 2500 2600 3000 3500 4000 4500 4800
## 2 8 1 33 2 42 3 1
## 5000 6000 6500 7000 8000 8500 9000 10000
## 99 34 1 38 20 1 24 193
## 11000 12000 12846 13000 14000 14500 15000 15500
## 3 54 1 11 10 1 174 1
## 16000 17000 18000 19000 19135 20000 21000 22000
## 23 8 15 6 1 220 2 12
## 22500 23000 24000 25000 26000 27000 27500 28000
## 1 4 5 105 1 5 1 2
## 30000 32000 33000 34000 35000 36000 37000 40000
## 177 2 1 1 54 4 1 88
## 41000 45000 48000 50000 54000 55000 57000 60000
## 3 17 1 113 2 4 2 48
## 62000 65000 66000 67000 68000 70000 75000 79000
## 1 7 1 1 1 15 4 1
## 79500 80000 85000 95000 1e+05 103000 104000 110000
## 1 13 1 1 27 1 1 1
## 115000 117000 125000 130000 135000 150000 160000 170000
## 2 1 1 1 1 11 1 1
## 2e+05 or more
## 19
# !!!Include relevant variables in list below
indirect_PII <- c("D_1",
"D_9",
"Inc_16",
"Inc_20a",
"HT_1cx1")
capture_tables (indirect_PII)
# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
selectedKeyVars = c('D_1', 'D_2', 'D_4') ##!!! Replace with candidate categorical demo vars
# creating the sdcMicro object with the assigned variables
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial
## The input dataset consists of 2011 rows and 29 variables.
## --> Categorical key variables: D_1, D_2, D_4
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## D_1 2 (2) 1005.500 (1005.500) 965 (965)
## D_2 6 (6) 335.167 (335.167) 37 (37)
## D_4 8 (8) 251.375 (251.375) 71 (71)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 7 (0.348%)
## - 3-anonymity: 17 (0.845%)
## - 5-anonymity: 38 (1.890%)
##
## ----------------------------------------------------------------------
Show values of key variable of records that violate k-anonymity
#mydata <- labelDataset(mydata)
notAnon <- sdcInitial@risk$individual[,2] < 2 # for 2-anonymity
mydata[notAnon,selectedKeyVars]
## # A tibble: 7 x 3
## D_1 D_2 D_4
## <dbl+lbl> <dbl+lbl> <dbl+lbl>
## 1 1 [Male] 5 [55-64] 6 [Bachelor/Postgraduate level]
## 2 2 [Female] 5 [55-64] 4 [SLC (11)]
## 3 2 [Female] 3 [35-44] 6 [Bachelor/Postgraduate level]
## 4 2 [Female] 1 [15-24] 7 [Literate, but never attended school]
## 5 1 [Male] 1 [15-24] 8 [Illiterate, and never attended school]
## 6 1 [Male] 1 [15-24] 7 [Literate, but never attended school]
## 7 2 [Female] 5 [55-64] 5 [CLASS 12/Intermediate level (12)]
sdcFinal <- localSuppression(sdcInitial)
# Recombining anonymized variables
extractManipData(sdcFinal)[notAnon,selectedKeyVars] # manipulated variables HH
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
## D_1 D_2 D_4
## 492 1 5 NA
## 545 2 5 NA
## 696 2 3 NA
## 1154 2 1 NA
## 1322 1 1 NA
## 1435 1 1 NA
## 1839 2 5 NA
mydata [notAnon,"D_4"] <- 999
# !!!No open ends
# !!!No GPS
Adds "_PU" (Public Use) to the end of the name
haven::write_dta(mydata, paste0(filename, "_PU.dta"))
haven::write_sav(mydata, paste0(filename, "_PU.sav"))