# Start from a clean workspace so objects left over from a previous dataset
# cannot leak into this run. `all.names = TRUE` also clears dot-prefixed
# objects. The earlier call passed the lower-case `t` (base R's transpose
# function) where the logical TRUE was intended, and relied on partial
# matching of the `all.names` argument.
rm(list = ls(all.names = TRUE))

# Base name of the dataset being processed; it is reused at the end of the
# script to name the public-use output files.
filename <- "Nepal_HT_Study_Round1_062816" # !!!Update filename

# Load the project helper file. The helpers called in this script
# (encode_location, ordinal_recode, top_recode, capture_tables, report_open)
# are not defined here, so they presumably come from this file.
source("functions_1.5.R")
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
#!!!Save flagged dictionary in .xlsx format, add "DatasetReview" to name and continue processing data with subset of flagged variables
# !!!No Direct PII
# !!!No Direct PII - team
# Small locations: Encode locations with pop <100,000 using random large numbers
# !!!Include relevant variables, but check their population size first to confirm they are <100,000
# Replace the VDC (small location) names with numeric codes; 999999 is passed
# as the missing-value code.
mydata <- encode_location(
  variables = "VDC",
  missing = 999999
)
## [1] "Frequency table before encoding"
## VDC. VDC code
## Barahathawa Dhungrekhola Dhurkauli Lalbandi Malangawa N.P.
## 63 66 63 63 63
## Netraganj Raniganj Sankarpur Bhimeswor N.P. Bocha
## 64 65 62 61 61
## Dandakharka Fasku Katakuti Lamidada Melung
## 62 62 63 62 64
## Pawati Badegau Talramarang BhoteNamlang Irkhu
## 64 64 64 66 62
## Ichok Kadambas Langarche Melamchi Anaikot
## 63 64 62 64 63
## BaluwapatiDeupur ChalalGaneshsthan KalatiBhumidanda MahankalChaur Methinkot
## 62 67 62 61 62
## Patalekhet RaviOpi Balkot Changunarayan Chitapol
## 64 62 59 62 63
## Duwakot Gundu Madhyapur Thimi NP Nankhel Sirutar
## 63 63 66 61 58
## Baireni Dhussa Khari Kiranchok Naubise
## 62 64 62 63 64
## Salyantar SunaulaBazar Thakre Chitlang Churiyamai
## 63 62 64 61 63
## Fakhel Kulekhani Nibuwatar Padampokhari ShreepurChhatiwan
## 62 62 60 65 62
## SisneriMahadevsthan Birendranagar Jutpani Kathar Khairahani
## 63 63 62 63 63
## Padampur Parbatipur Piple Shaktikhor Chhayachhetra
## 62 62 65 64 63
## Damachaur Devsthal Dhanwang Phalawang Sibaratha
## 63 65 64 64 62
## Siddheswar Tribeni Baijapur Binauna Chisapani
## 63 63 63 62 65
## Khaskusma Kohalpur Nepalgunj Rajhena Samserganj
## 62 62 60 62 63
## [1] "Frequency table after encoding"
## VDC. VDC code
## 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002
## 63 66 63 63 63 64 65 62 61 61 62 62 63 62 64 64 64 64 66 62 63 64 62
## 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025
## 64 63 62 67 62 61 62 64 62 59 62 63 63 63 66 61 58 62 64 62 63 64 63
## 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048
## 62 64 61 63 62 62 60 65 62 63 63 62 63 63 62 62 65 64 63 63 65 64 64
## 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059
## 62 63 63 63 62 65 62 62 60 62 63
# Focus on variables with a "Lowest Freq" of 10 or less.
# Age bands for D_2. Each break point is the lower bound of a band; the
# recorded cross-tab below shows the bands as left-closed intervals,
# e.g. [15,25).
break_age <- c(0, 15, 25, 35, 45, 55, 100)

# Value labels for the recoded bands: label text -> numeric code 1..7.
# Codes are kept as doubles, matching the literal form c("..." = 1, ...).
labels_age <- setNames(
  as.numeric(1:7),
  c(
    "Less than 15",
    "15-24",
    "25-34",
    "35-44",
    "45-54",
    "55 and older",
    "NA"
  )
)
# Recode age (D_2) into the bands defined by break_age and attach labels_age;
# 999999 is passed as the missing-value code.
mydata <- ordinal_recode(
  variable = "D_2",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)
## recoded
## [0,15) [15,25) [25,35) [35,45) [45,55) [55,100) [100,1e+06)
## 13 170 0 0 0 0 0 0
## 14 220 0 0 0 0 0 0
## 15 0 261 0 0 0 0 0
## 16 0 293 0 0 0 0 0
## 17 0 268 0 0 0 0 0
## 18 0 249 0 0 0 0 0
## 19 0 211 0 0 0 0 0
## 20 0 238 0 0 0 0 0
## 21 0 190 0 0 0 0 0
## 22 0 170 0 0 0 0 0
## 23 0 139 0 0 0 0 0
## 24 0 101 0 0 0 0 0
## 25 0 0 145 0 0 0 0
## 26 0 0 130 0 0 0 0
## 27 0 0 119 0 0 0 0
## 28 0 0 110 0 0 0 0
## 29 0 0 105 0 0 0 0
## 30 0 0 151 0 0 0 0
## 31 0 0 86 0 0 0 0
## 32 0 0 95 0 0 0 0
## 33 0 0 75 0 0 0 0
## 34 0 0 77 0 0 0 0
## 35 0 0 0 101 0 0 0
## 36 0 0 0 62 0 0 0
## 37 0 0 0 75 0 0 0
## 38 0 0 0 57 0 0 0
## 39 0 0 0 64 0 0 0
## 40 0 0 0 95 0 0 0
## 41 0 0 0 42 0 0 0
## 42 0 0 0 69 0 0 0
## 43 0 0 0 56 0 0 0
## 44 0 0 0 57 0 0 0
## 45 0 0 0 0 69 0 0
## 46 0 0 0 0 59 0 0
## 47 0 0 0 0 43 0 0
## 48 0 0 0 0 61 0 0
## 49 0 0 0 0 47 0 0
## 50 0 0 0 0 47 0 0
## 51 0 0 0 0 41 0 0
## 52 0 0 0 0 37 0 0
## 53 0 0 0 0 37 0 0
## 54 0 0 0 0 37 0 0
## 55 0 0 0 0 0 52 0
## 56 0 0 0 0 0 27 0
## 57 0 0 0 0 0 25 0
## 58 0 0 0 0 0 31 0
## 59 0 0 0 0 0 27 0
## 60 0 0 0 0 0 41 0
## 61 0 0 0 0 0 15 0
## 62 0 0 0 0 0 26 0
## 63 0 0 0 0 0 11 0
## 64 0 0 0 0 0 10 0
## 65 0 0 0 0 0 2 0
## 66 0 0 0 0 0 1 0
## D_2. How old are you? [Use the timeline in the manual if the respondent has a ha
## Less than 15 15-24 25-34 35-44 45-54 55 and older
## 390 2120 1093 678 478 268
## [1] "Inspect value labels and relabel as necessary"
## Less than 15 15-24 25-34 35-44 45-54 55 and older NA
## 1 2 3 4 5 6 7
# Recode education into standard categories.
# Break points are the lower bounds of each category; 777, 888 and 999 start
# the bands that receive the "Refused", "Does not apply" and "Don't Know"
# labels.
break_edu <- c(0, 6, 9, 11, 12, 13, 17, 18, 777, 888, 999)

# Value labels: label text -> numeric code 1..11 (doubles).
labels_edu <- setNames(
  as.numeric(1:11),
  c(
    "Primary or less (0-5)",
    "Lower secondary (6-8)",
    "Secondary (9-10)",
    "SLC (11)",
    "CLASS 12/Intermediate level (12)",
    "Bachelor/Postgraduate level",
    "Literate, but never attended school",
    "Illiterate, and never attended school",
    "Refused",
    "Does not apply",
    "Don't Know"
  )
)
# Recode education (D_4) using break_edu / labels_edu; 999 is passed as the
# missing-value code.
mydata <- ordinal_recode(
  variable = "D_4",
  break_points = break_edu,
  missing = 999,
  value_labels = labels_edu
)
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,17) [17,18) [18,777) [777,888) [888,999) [999,1e+03)
## 0 6 0 0 0 0 0 0 0 0 0 0
## 1 91 0 0 0 0 0 0 0 0 0 0
## 2 167 0 0 0 0 0 0 0 0 0 0
## 3 184 0 0 0 0 0 0 0 0 0 0
## 4 231 0 0 0 0 0 0 0 0 0 0
## 5 347 0 0 0 0 0 0 0 0 0 0
## 6 0 277 0 0 0 0 0 0 0 0 0
## 7 0 380 0 0 0 0 0 0 0 0 0
## 8 0 423 0 0 0 0 0 0 0 0 0
## 9 0 0 312 0 0 0 0 0 0 0 0
## 10 0 0 322 0 0 0 0 0 0 0 0
## 11 0 0 0 767 0 0 0 0 0 0 0
## 12 0 0 0 0 443 0 0 0 0 0 0
## 13 0 0 0 0 0 81 0 0 0 0 0
## 14 0 0 0 0 0 24 0 0 0 0 0
## 17 0 0 0 0 0 0 357 0 0 0 0
## 18 0 0 0 0 0 0 0 601 0 0 0
## D_4. What is your highest completed education level? [You do not need to read the
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 1026 1080 634
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 767 443 105
## Literate, but never attended school Illiterate, and never attended school <NA>
## 357 601 14
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 1 2 3
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 4 5 6
## Literate, but never attended school Illiterate, and never attended school Refused
## 7 8 9
## Does not apply Don't Know
## 10 11
# Top code household composition variables with large and unusual numbers.
# D_20 (number of children): values of 5 and above are collapsed into a single
# top category; 888 and 999999 are passed as missing-value codes.
mydata <- top_recode(
  variable = "D_20",
  break_point = 5,
  missing = c(888, 999999)
)
## [1] "Frequency table after encoding"
## D_20. How many children do you have?
## 0 1 2 3 4 5 6 7 8 9 10 11 888
## 2213 591 840 655 342 189 103 37 15 4 2 2 34
## [1] "Frequency table after encoding"
## D_20. How many children do you have?
## 0 1 2 3 4 5 or more 888
## 2213 591 840 655 342 352 34
# Top code high income (Inc_17) at the 99.5th percentile.
# The percentile is computed on values other than the 999999 missing code.
# na.rm = TRUE is required for robustness: `x != 999999` is NA for missing
# entries, an NA index keeps those entries in the subset, and quantile()
# stops on NA input unless na.rm is set.
# NOTE(review): the frequency table recorded for Inc_17 also contains 777, 888
# and 999, which look like refusal / don't-know codes. They are neither
# excluded from the percentile nor passed to `missing` -- confirm whether
# they should be.
percentile_99.5 <- floor(
  quantile(
    mydata$Inc_17[mydata$Inc_17 != 999999],
    probs = 0.995,
    na.rm = TRUE
  )
)
mydata <- top_recode(
  variable = "Inc_17",
  break_point = percentile_99.5,
  missing = 999999
)
## [1] "Frequency table after encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS)
## 0 10 15 30 35 36 300 400 500 600 777 888 999 1000
## 169 1 1 1 1 1 1 3 8 1 59 1 310 31
## 1200 1250 1300 1400 1500 1600 2000 2083 2100 2200 2300 2500 2600 3000
## 11 1 1 1 34 2 110 1 1 1 1 22 1 165
## 3500 3800 4000 4100 4500 5000 5500 5600 6000 6500 7000 7500 8000 8500
## 7 1 156 1 10 434 2 1 171 2 154 8 149 1
## 9000 9500 10000 10400 10500 11000 12000 12200 12500 12800 13000 14000 15000 16000
## 75 1 505 1 2 12 194 1 7 1 52 29 484 41
## 17000 17200 18000 19000 20000 21000 22000 23000 24000 24400 24500 25000 26000 27000
## 25 1 52 4 419 8 22 5 10 1 1 238 8 4
## 28000 29000 30000 31000 32000 33000 34500 35000 36000 38000 39000 40000 41000 41600
## 8 3 225 3 7 5 1 72 5 1 1 98 1 1
## 42000 43000 44000 45000 50000 52000 53000 55000 58000 59500 60000 65000 66000 70000
## 2 3 1 24 124 1 1 8 2 1 50 1 1 14
## 75000 80000 85000 90000 1e+05 108000 109000 110000 120000 125000 130000 150000 180000 190000
## 5 17 4 7 33 1 1 1 4 2 1 14 1 1
## 2e+05 3e+05 320000 350000 4e+05 5e+05 630000 7e+05 1500000 2e+06
## 22 1 1 1 1 1 1 1 1 1
## [1] "Frequency table after encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS)
## 0 10 15 30 35 36 300 400
## 169 1 1 1 1 1 1 3
## 500 600 777 888 999 1000 1200 1250
## 8 1 59 1 310 31 11 1
## 1300 1400 1500 1600 2000 2083 2100 2200
## 1 1 34 2 110 1 1 1
## 2300 2500 2600 3000 3500 3800 4000 4100
## 1 22 1 165 7 1 156 1
## 4500 5000 5500 5600 6000 6500 7000 7500
## 10 434 2 1 171 2 154 8
## 8000 8500 9000 9500 10000 10400 10500 11000
## 149 1 75 1 505 1 2 12
## 12000 12200 12500 12800 13000 14000 15000 16000
## 194 1 7 1 52 29 484 41
## 17000 17200 18000 19000 20000 21000 22000 23000
## 25 1 52 4 419 8 22 5
## 24000 24400 24500 25000 26000 27000 28000 29000
## 10 1 1 238 8 4 8 3
## 30000 31000 32000 33000 34500 35000 36000 38000
## 225 3 7 5 1 72 5 1
## 39000 40000 41000 41600 42000 43000 44000 45000
## 1 98 1 1 2 3 1 24
## 50000 52000 53000 55000 58000 59500 60000 65000
## 124 1 1 8 2 1 50 1
## 66000 70000 75000 80000 85000 90000 1e+05 108000
## 1 14 5 17 4 7 33 1
## 109000 110000 120000 125000 130000 150000 180000 190000
## 1 1 4 2 1 14 1 1
## 2e+05 or more
## 31
# Top code household expenditure (Inc_23) at the 99.5th percentile.
# The percentile is computed on values other than the 999999 missing code.
# na.rm = TRUE is required for robustness: `x != 999999` is NA for missing
# entries, an NA index keeps those entries in the subset, and quantile()
# stops on NA input unless na.rm is set.
# NOTE(review): the frequency table recorded for Inc_23 also contains 777 and
# 999, which look like refusal / don't-know codes. They are neither excluded
# from the percentile nor passed to `missing` -- confirm whether they should
# be.
percentile_99.5 <- floor(
  quantile(
    mydata$Inc_23[mydata$Inc_23 != 999999],
    probs = 0.995,
    na.rm = TRUE
  )
)
mydata <- top_recode(
  variable = "Inc_23",
  break_point = percentile_99.5,
  missing = 999999
)
## [1] "Frequency table after encoding"
## Inc_23. In a typical month, what is your total household expenditure? (in NRS)
## 15 200 300 400 500 600 700 777 800 900 999 1000 1070 1100 1200 1300
## 1 1 3 1 18 1 1 4 1 1 179 39 1 1 14 2
## 1400 1500 1600 1800 2000 2200 2400 2500 2600 2800 3000 3500 4000 4009 4500 5000
## 1 38 1 1 167 3 1 61 1 1 310 29 330 1 17 721
## 5500 6000 6500 7000 7500 8000 9000 10000 11000 12000 13000 13500 14000 15000 16000 17000
## 2 352 1 286 3 272 103 773 7 181 47 1 16 446 14 8
## 18000 19000 20000 21000 22000 23000 24000 25000 27000 28000 30000 32000 32500 35000 40000 42000
## 24 1 251 4 13 4 4 95 3 1 78 2 1 17 23 1
## 45000 50000 60000 70000 80000 90000 1e+05 120000 140000 2e+05 9e+05
## 4 17 9 3 2 1 2 1 1 1 1
## [1] "Frequency table after encoding"
## Inc_23. In a typical month, what is your total household expenditure? (in NRS)
## 15 200 300 400 500 600 700 777
## 1 1 3 1 18 1 1 4
## 800 900 999 1000 1070 1100 1200 1300
## 1 1 179 39 1 1 14 2
## 1400 1500 1600 1800 2000 2200 2400 2500
## 1 38 1 1 167 3 1 61
## 2600 2800 3000 3500 4000 4009 4500 5000
## 1 1 310 29 330 1 17 721
## 5500 6000 6500 7000 7500 8000 9000 10000
## 2 352 1 286 3 272 103 773
## 11000 12000 13000 13500 14000 15000 16000 17000
## 7 181 47 1 16 446 14 8
## 18000 19000 20000 21000 22000 23000 24000 25000
## 24 1 251 4 13 4 4 95
## 27000 28000 30000 32000 32500 35000 40000 42000
## 3 1 78 2 1 17 23 1
## 45000 50000 or more
## 4 38
# !!!Include relevant variables in list below
# Names of the variables flagged as indirect PII, grouped by questionnaire
# section prefix (order is unchanged).
indirect_PII <- c(
  "D_3", "D_4", "D_6", "D_9",
  "EM_16_1", "EM_16_2", "EM_16_3", "EM_16_6",
  "CM_1C", "CM_2",
  "ME_1", "ME_3", "ME_5", "ME_7", "ME_13", "ME_14", "ME_16"
)
# Pass the flagged indirect-PII variable names to the capture_tables helper
# (presumably tabulates each one for review -- helper is defined outside this
# script).
capture_tables(indirect_PII)
# Encode caste: D_3 (ethnic background) is run through the same
# encode_location helper used for locations, so its categories are replaced
# by numeric codes; 999999 is passed as the missing-value code.
mydata <- encode_location(
  variables = "D_3",
  missing = 999999
)
## [1] "Frequency table before encoding"
## D_3. What is your ethnic background? [You do not need to read the response choices
## chhetri BRAHMAN (HILL) magar tharu tamang newar
## 1296 704 302 206 1078 391
## muslim kami yadav rai gurung DAMAIN/DHOLI
## 7 195 21 12 51 93
## limbu thakuri sarki teli CHAMAR/HARIJAN/RAM koiri
## 2 80 68 6 2 74
## kurmi DUSADH/PASWAN/PASI sonar BRAHMAN (TARAI) GHARTI/BHUJEL malla
## 1 9 11 9 49 1
## kalwar kumal HAJAM/THAKUR sunuwar sudhi lohar
## 11 35 3 3 1 5
## tatma khatwe majhi nuniya kumhar danuwar
## 3 3 6 2 3 4
## CHEPANG/PRAJA haluwai rajput kayastha badhae marwadi
## 92 1 5 8 1 4
## thami darai pahari dom bote ADIBASI/JANAJATI
## 18 14 10 1 1 1
## badi OTHER CASTE <NA>
## 4 118 2
## [1] "Frequency table after encoding"
## D_3. What is your ethnic background? [You do not need to read the response choices
## 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002
## 1296 704 302 206 1078 391 7 195 21 12 51 93 2 80 68 6 2 74 1 9 11 9 49
## 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025
## 1 11 35 3 3 1 5 3 3 6 2 3 4 92 1 5 8 1 4 18 14 10 1
## 1026 1027 1028 1029 <NA>
## 1 1 4 118 2
# Recode religion.
# Break points are the lower bounds of each category; everything from 3 up to
# (but excluding) 777 falls in the band labelled "Other".
break_rel <- c(1, 2, 3, 777, 888, 999)

# Value labels: label text -> numeric code 1..6 (doubles).
labels_rel <- setNames(
  as.numeric(1:6),
  c(
    "Hindu",
    "Buddhist",
    "Other",
    "Refused",
    "Not applicable",
    "Don't know"
  )
)
# Recode religion (D_6) using break_rel / labels_rel; 999 is passed as the
# missing-value code.
mydata <- ordinal_recode(
  variable = "D_6",
  break_points = break_rel,
  missing = 999,
  value_labels = labels_rel
)
## recoded
## [1,2) [2,3) [3,777) [777,888) [888,999) [999,1e+03)
## 1 3941 0 0 0 0 0
## 2 0 944 0 0 0 0
## 3 0 0 12 0 0 0
## 4 0 0 2 0 0 0
## 6 0 0 116 0 0 0
## 9 0 0 3 0 0 0
## D_6. What is your religious background? [You do not need to read the response choi
## Hindu Buddhist Other <NA>
## 3941 944 133 9
## [1] "Inspect value labels and relabel as necessary"
## Hindu Buddhist Other Refused Not applicable Don't know
## 1 2 3 4 5 6
# Recode reason for moving.
# Break points are the lower bounds of each category; everything from 4 up to
# (but excluding) 777 falls in the band labelled "Other".
break_mov <- c(1, 2, 3, 4, 777, 888, 999)

# Value labels: label text -> numeric code. These labels are attached to the
# recoded variable, so the misspelling "asnwer" in the refusal label is
# corrected here.
labels_mov <- c(
  "Family reasons (e.g., marriage)" = 1,
  "For education / training" = 2,
  "For work" = 3,
  "Other" = 4,
  "Refused to answer" = 5,
  "Not applicable" = 6,
  "Don't know" = 7
)
# Recode primary reason for migrating (CM_2) using break_mov / labels_mov;
# 999 is passed as the missing-value code.
mydata <- ordinal_recode(
  variable = "CM_2",
  break_points = break_mov,
  missing = 999,
  value_labels = labels_mov
)
## recoded
## [1,2) [2,3) [3,4) [4,777) [777,888) [888,999) [999,1e+03)
## 1 491 0 0 0 0 0 0
## 2 0 99 0 0 0 0 0
## 3 0 0 966 0 0 0 0
## 4 0 0 0 7 0 0 0
## 5 0 0 0 1 0 0 0
## 6 0 0 0 126 0 0 0
## 7 0 0 0 25 0 0 0
## CM_2. Primary Reason for Migrating
## Family reasons (e.g., marriage) For education / training For work
## 491 99 966
## Other <NA>
## 159 3312
## [1] "Inspect value labels and relabel as necessary"
## Family reasons (e.g., marriage) For education / training For work
## 1 2 3
## Other Refused to asnwer Not applicable
## 4 5 6
## Don't know
## 7
# Build the sdcMicro object from the variables selected during dictionary
# inspection.
# Reference: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# Every name listed here must match a column name in the data file.
# Selected categorical key variables: gender, occupation/education and age.
selectedKeyVars <- c("D_1", "D_2", "D_4") ##!!! Replace with candidate categorical demo vars
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)

# Display the object's summary (category sizes and k-anonymity violations)
sdcInitial
## The input dataset consists of 5027 rows and 171 variables.
## --> Categorical key variables: D_1, D_2, D_4
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## D_1 2 (2) 2513.500 (2513.500) 2510 (2510)
## D_2 6 (6) 837.833 (837.833) 268 (268)
## D_4 9 (9) 626.625 (626.625) 105 (105)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 3 (0.060%)
## - 3-anonymity: 6 (0.119%)
## - 5-anonymity: 12 (0.239%)
##
## ----------------------------------------------------------------------
# Show the key-variable values of the records that violate k-anonymity
#mydata <- labelDataset(mydata)
# Flag the records whose value in column 2 of the individual-risk matrix is
# below 2; this is the script's check for 2-anonymity.
# NOTE(review): column 2 is presumably the sample frequency count of the key
# combination -- confirm against the sdcMicro documentation.
notAnon <- sdcInitial@risk[["individual"]][, 2] < 2 # for 2-anonymity

# Display the key variables of the flagged records
mydata[notAnon, selectedKeyVars]
## # A tibble: 3 x 3
## D_1 D_2 D_4
## <dbl+lbl> <dbl+lbl> <dbl+lbl>
## 1 2 [Female] 1 [Less than 15] 4 [SLC (11)]
## 2 2 [Female] 5 [45-54] 6 [Bachelor/Postgraduate level]
## 3 2 [Female] 5 [45-54] 5 [CLASS 12/Intermediate level (12)]
# Apply sdcMicro's local suppression to the key variables, using the
# function's default settings.
sdcFinal <- localSuppression(sdcInitial)

# Recombining anonymized variables: display the manipulated key variables for
# the records flagged in `notAnon`, to see which values were suppressed.
extractManipData(sdcFinal)[notAnon, selectedKeyVars] # manipulated variables HH
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
## D_1 D_2 D_4
## 253 2 1 NA
## 2170 2 5 NA
## 4874 2 5 NA
# Overwrite D_4 (education) for the records flagged in `notAnon`; these are
# the rows for which the recorded suppression output shows D_4 as NA.
# NOTE(review): 9 is the code labelled "Refused" in labels_edu -- confirm this
# is the intended code for suppressed values.
mydata[notAnon, "D_4"] <- 9

# Run the open-ended text variable through the report_open helper for review.
# Judging by the recorded warning, it uses a "verbatims" folder under the
# working directory.
report_open(list_open_ends = "HTV_1_10_TEXT")
## Warning in dir.create(file.path(getwd(), "verbatims"), recursive = TRUE): 'C:\Users\C_Pablo_Diego-
## Rosell\Desktop\Other Projects\Dwight\ILAB PII\Data\FINAL\UC Berkeley_Nepal_Awareness-General
## Public\PublicData_R3\Data\Nepal_HT_Study_Round1_062816\verbatims' already exists
# Drop HTV_1_10_TEXT from the dataset: it is the only verbatim variable and
# it holds actual verbatim responses in Nepali.
mydata <- mydata[names(mydata) != "HTV_1_10_TEXT"]
# !!!No GPS
# Write the processed dataset in Stata (.dta) and SPSS (.sav) formats.
# "_PU" (Public Use) is appended to the base filename for both outputs.
haven::write_dta(
  data = mydata,
  path = paste0(filename, "_PU.dta")
)
haven::write_sav(
  data = mydata,
  path = paste0(filename, "_PU.sav")
)