# Clear the workspace, including hidden objects (names beginning with ".").
# BUG FIX: ls(all = t) passed the transpose *function* `t` where a logical is
# expected (`all` partial-matches `all.names`), which errors at runtime.
# Spell the argument out and use TRUE.
rm(list = ls(all.names = TRUE))
filename <- "Round1_demo" # !!!Update filename
# Project helpers: encode_location(), ordinal_recode(), top_recode(), capture_tables()
source ("functions_1.5.R")
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location: Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
#!!!Save flagged dictionary in .xlsx format, add "DatasetReview" to name and continue processing data with subset of flagged variables
# !!!No direct PII variables
# !!!No direct PII - team variables
# !!!Include relevant variables, but check their population size first to confirm they are <100,000
locvars <- c("VDC")
mydata <- encode_location (variables= locvars, missing=999999)
## [1] "Frequency table before encoding"
## VDC. VDC code
## Barahathawa Dhungrekhola Dhurkauli Lalbandi Malangawa N.P.
## 63 66 63 63 63
## Netraganj Raniganj Sankarpur Bhimeswor N.P. Bocha
## 64 65 62 61 61
## Dandakharka Fasku Katakuti Lamidada Melung
## 62 62 63 62 64
## Pawati Badegau Talramarang BhoteNamlang Irkhu
## 64 64 64 66 62
## Ichok Kadambas Langarche Melamchi Anaikot
## 63 64 62 64 63
## BaluwapatiDeupur ChalalGaneshsthan KalatiBhumidanda MahankalChaur Methinkot
## 62 67 62 61 62
## Patalekhet RaviOpi Balkot Changunarayan Chitapol
## 64 62 59 62 63
## Duwakot Gundu Madhyapur Thimi NP Nankhel Sirutar
## 63 63 66 61 58
## Baireni Dhussa Khari Kiranchok Naubise
## 62 64 62 63 64
## Salyantar SunaulaBazar Thakre Chitlang Churiyamai
## 63 62 64 61 63
## Fakhel Kulekhani Nibuwatar Padampokhari ShreepurChhatiwan
## 62 62 60 65 62
## SisneriMahadevsthan Birendranagar Jutpani Kathar Khairahani
## 63 63 62 63 63
## Padampur Parbatipur Piple Shaktikhor
## 62 62 65 64
## [1] "Frequency table after encoding"
## VDC. VDC code
## 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002
## 63 66 63 63 63 64 65 62 61 61 62 62 63 62 64 64 64 64 66 62 63 64 62
## 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025
## 64 63 62 67 62 61 62 64 62 59 62 63 63 63 66 61 58 62 64 62 63 64 63
## 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043
## 62 64 61 63 62 62 60 65 62 63 63 62 63 63 62 62 65 64
# Focus on variables with a "Lowest Freq" of 10 or less.
# Age bins for D_2: six ordinal bands; a 7th "NA" label is reserved for missing.
break_age <- c(0, 15, 25, 35, 45, 55, 100)
labels_age <- setNames(
  c(1, 2, 3, 4, 5, 6, 7),
  c("Less than 15", "15-24", "25-34", "35-44", "45-54", "55 and older", "NA")
)
# Recode exact age (D_2) into the six ordinal bands defined above;
# 999999 is the missing-value code and falls in the final catch-all interval.
mydata <- ordinal_recode (variable="D_2", break_points=break_age, missing=999999, value_labels=labels_age)
## recoded
## [0,15) [15,25) [25,35) [35,45) [45,55) [55,100) [100,1e+06)
## 13 132 0 0 0 0 0 0
## 14 171 0 0 0 0 0 0
## 15 0 199 0 0 0 0 0
## 16 0 244 0 0 0 0 0
## 17 0 220 0 0 0 0 0
## 18 0 210 0 0 0 0 0
## 19 0 171 0 0 0 0 0
## 20 0 193 0 0 0 0 0
## 21 0 150 0 0 0 0 0
## 22 0 138 0 0 0 0 0
## 23 0 99 0 0 0 0 0
## 24 0 81 0 0 0 0 0
## 25 0 0 109 0 0 0 0
## 26 0 0 100 0 0 0 0
## 27 0 0 93 0 0 0 0
## 28 0 0 94 0 0 0 0
## 29 0 0 81 0 0 0 0
## 30 0 0 115 0 0 0 0
## 31 0 0 71 0 0 0 0
## 32 0 0 68 0 0 0 0
## 33 0 0 59 0 0 0 0
## 34 0 0 63 0 0 0 0
## 35 0 0 0 77 0 0 0
## 36 0 0 0 46 0 0 0
## 37 0 0 0 56 0 0 0
## 38 0 0 0 42 0 0 0
## 39 0 0 0 54 0 0 0
## 40 0 0 0 74 0 0 0
## 41 0 0 0 32 0 0 0
## 42 0 0 0 54 0 0 0
## 43 0 0 0 44 0 0 0
## 44 0 0 0 51 0 0 0
## 45 0 0 0 0 56 0 0
## 46 0 0 0 0 46 0 0
## 47 0 0 0 0 37 0 0
## 48 0 0 0 0 50 0 0
## 49 0 0 0 0 40 0 0
## 50 0 0 0 0 43 0 0
## 51 0 0 0 0 33 0 0
## 52 0 0 0 0 33 0 0
## 53 0 0 0 0 32 0 0
## 54 0 0 0 0 34 0 0
## 55 0 0 0 0 0 44 0
## 56 0 0 0 0 0 24 0
## 57 0 0 0 0 0 21 0
## 58 0 0 0 0 0 28 0
## 59 0 0 0 0 0 24 0
## 60 0 0 0 0 0 31 0
## 61 0 0 0 0 0 11 0
## 62 0 0 0 0 0 21 0
## 63 0 0 0 0 0 11 0
## 64 0 0 0 0 0 9 0
## 65 0 0 0 0 0 1 0
## 66 0 0 0 0 0 1 0
## D_2. How old are you?<U+00A0> [Use the timeline in the manual if the respondent has a ha
## Less than 15 15-24 25-34 35-44 45-54 55 and older
## 303 1705 853 530 404 226
## [1] "Inspect value labels and relabel as necessary"
## Less than 15 15-24 25-34 35-44 45-54 55 and older NA
## 1 2 3 4 5 6 7
# Education bins for D_4. Years-of-schooling break points first, then the
# interview special codes 777/888/999 (refused / not-applicable / don't-know).
break_edu <- c(0, 6, 9, 11, 12, 13, 16, 17, 18, 777, 888, 999)
labels_edu <- setNames(
  c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
  c(
    "Primary or less (0-5)",
    "Lower secondary (6-8)",
    "Secondary (9-10)",
    "SLC (11)",
    "CLASS 12/Intermediate level (12)",
    "Bachelor/Postgraduate level",
    "Literate, but never attended school",
    "Illiterate, and never attended school",
    "Refused",
    "Does not apply",
    "Don't Know"
  )
)
# Recode years of education (D_4) into ordinal categories; 999 is the missing code.
# NOTE(review): the [16,17) bin ("Literate, but never attended school") receives
# zero observations, while raw codes 17 and 18 fall into bins 8 ("Illiterate...")
# and 9 ("Refused") — a "Refused" count of 490 looks high. Confirm against the
# instrument whether the breaks are off by one for these special codes.
mydata <- ordinal_recode (variable="D_4", break_points=break_edu, missing=999, value_labels=labels_edu)
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,16) [16,17) [17,18) [18,777) [777,888) [888,999) [999,1e+03)
## 0 6 0 0 0 0 0 0 0 0 0 0 0
## 1 85 0 0 0 0 0 0 0 0 0 0 0
## 2 136 0 0 0 0 0 0 0 0 0 0 0
## 3 151 0 0 0 0 0 0 0 0 0 0 0
## 4 194 0 0 0 0 0 0 0 0 0 0 0
## 5 264 0 0 0 0 0 0 0 0 0 0 0
## 6 0 218 0 0 0 0 0 0 0 0 0 0
## 7 0 287 0 0 0 0 0 0 0 0 0 0
## 8 0 323 0 0 0 0 0 0 0 0 0 0
## 9 0 0 238 0 0 0 0 0 0 0 0 0
## 10 0 0 240 0 0 0 0 0 0 0 0 0
## 11 0 0 0 640 0 0 0 0 0 0 0 0
## 12 0 0 0 0 363 0 0 0 0 0 0 0
## 13 0 0 0 0 0 59 0 0 0 0 0 0
## 14 0 0 0 0 0 15 0 0 0 0 0 0
## 17 0 0 0 0 0 0 0 299 0 0 0 0
## 18 0 0 0 0 0 0 0 0 490 0 0 0
## D_4. What is your highest completed education level?<U+00A0> [You do not need to read the
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 836 828 478
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 640 363 74
## Illiterate, and never attended school Refused <NA>
## 299 490 13
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 1 2 3
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 4 5 6
## Literate, but never attended school Illiterate, and never attended school Refused
## 7 8 9
## Does not apply Don't Know
## 10 11
# Top code household composition variables with large and unusual numbers
mydata <- top_recode ("D_20", break_point=5, missing=c(888, 999999)) # Top-code D_20 ("How many children do you have?") at "5 or more"; missing codes 888/999999 are left untouched. (Original comment said "adult household members" — the transcript shows this is the children count.)
## [1] "Frequency table after encoding"
## D_20. How many children do you have?
## 0 1 2 3 4 5 6 7 8 9 10 11 888
## 1816 443 662 507 267 153 92 29 12 2 2 2 34
## [1] "Frequency table after encoding"
## D_20. How many children do you have?
## 0 1 2 3 4 5 or more 888
## 1816 443 662 507 267 292 34
# Top code high income to the 99.5 percentile
# NOTE(review): only 999999 is excluded from the quantile, but the frequency
# table below also shows codes 777/888/999 in Inc_17 (likely refusal/DK codes).
# They are small values so they barely shift the 99.5th percentile, but confirm
# whether they should be excluded here too.
percentile_99.5 <- floor(quantile(mydata$Inc_17[mydata$Inc_17!=999999], probs = c(0.995)))
mydata <- top_recode (variable="Inc_17", break_point=percentile_99.5, missing=999999)
## [1] "Frequency table after encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS)
## 0 15 30 35 36 300 400 500 600 777 888 999 1000 1200 1250 1300
## 132 1 1 1 1 1 1 6 1 49 1 249 25 9 1 1
## 1500 1600 2000 2083 2100 2200 2300 2500 2600 3000 3500 3800 4000 4100 4500 5000
## 25 2 85 1 1 1 1 16 1 140 4 1 129 1 8 352
## 5500 6000 6500 7000 7500 8000 9000 9500 10000 10400 10500 11000 12000 12200 12500 13000
## 2 142 2 121 8 111 63 1 399 1 1 9 155 1 6 40
## 14000 15000 16000 17000 17200 18000 19000 20000 21000 22000 23000 24000 24400 25000 26000 27000
## 20 408 35 16 1 39 2 338 7 16 5 7 1 193 7 3
## 28000 29000 30000 31000 32000 33000 34500 35000 36000 38000 40000 41000 41600 42000 43000 45000
## 6 2 176 2 2 4 1 61 2 1 80 1 1 2 2 22
## 50000 53000 55000 58000 60000 65000 66000 70000 75000 80000 85000 90000 1e+05 108000 110000 120000
## 93 1 4 2 40 1 1 11 1 15 1 5 29 1 1 2
## 125000 130000 150000 2e+05 320000 350000 4e+05 5e+05 7e+05 2e+06
## 2 1 13 18 1 1 1 1 1 1
## [1] "Frequency table after encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS)
## 0 15 30 35 36 300 400 500
## 132 1 1 1 1 1 1 6
## 600 777 888 999 1000 1200 1250 1300
## 1 49 1 249 25 9 1 1
## 1500 1600 2000 2083 2100 2200 2300 2500
## 25 2 85 1 1 1 1 16
## 2600 3000 3500 3800 4000 4100 4500 5000
## 1 140 4 1 129 1 8 352
## 5500 6000 6500 7000 7500 8000 9000 9500
## 2 142 2 121 8 111 63 1
## 10000 10400 10500 11000 12000 12200 12500 13000
## 399 1 1 9 155 1 6 40
## 14000 15000 16000 17000 17200 18000 19000 20000
## 20 408 35 16 1 39 2 338
## 21000 22000 23000 24000 24400 25000 26000 27000
## 7 16 5 7 1 193 7 3
## 28000 29000 30000 31000 32000 33000 34500 35000
## 6 2 176 2 2 4 1 61
## 36000 38000 40000 41000 41600 42000 43000 45000
## 2 1 80 1 1 2 2 22
## 50000 53000 55000 58000 60000 65000 66000 70000
## 93 1 4 2 40 1 1 11
## 75000 80000 85000 90000 1e+05 108000 110000 120000
## 1 15 1 5 29 1 1 2
## 125000 130000 150000 2e+05 or more
## 2 1 13 24
# Top-code household expenditure (Inc_23) at its 99.5th percentile; here the
# missing code is 999 (note it differs from Inc_17's 999999).
# NOTE(review): code 777 also appears in the table below — verify it is a real
# expenditure and not a refusal code before publication.
percentile_99.5 <- floor(quantile(mydata$Inc_23[mydata$Inc_23!=999], probs = c(0.995)))
mydata <- top_recode (variable="Inc_23", break_point=percentile_99.5, missing=999)
## [1] "Frequency table after encoding"
## Inc_23. In a typical month, what is your total household expenditure? (in NRS)
## 15 200 300 400 500 600 700 777 800 900 999 1000 1070 1100 1200 1300
## 1 1 2 1 16 1 1 3 1 1 135 35 1 1 12 1
## 1400 1500 1600 1800 2000 2200 2500 2600 3000 3500 4000 4009 4500 5000 5500 6000
## 1 29 1 1 131 3 52 1 255 23 260 1 14 559 1 267
## 7000 7500 8000 9000 10000 11000 12000 13000 13500 14000 15000 16000 17000 18000 19000 20000
## 221 3 225 82 627 4 152 40 1 13 361 8 7 21 1 200
## 21000 22000 23000 24000 25000 27000 30000 32000 32500 35000 40000 42000 45000 50000 60000 70000
## 4 11 3 2 85 3 68 2 1 13 17 1 4 11 8 2
## 80000 90000 1e+05 120000 2e+05 9e+05
## 2 1 2 1 1 1
## [1] "Frequency table after encoding"
## Inc_23. In a typical month, what is your total household expenditure? (in NRS)
## 15 200 300 400 500 600 700 777
## 1 1 2 1 16 1 1 3
## 800 900 999 1000 1070 1100 1200 1300
## 1 1 135 35 1 1 12 1
## 1400 1500 1600 1800 2000 2200 2500 2600
## 1 29 1 1 131 3 52 1
## 3000 3500 4000 4009 4500 5000 5500 6000
## 255 23 260 1 14 559 1 267
## 7000 7500 8000 9000 10000 11000 12000 13000
## 221 3 225 82 627 4 152 40
## 13500 14000 15000 16000 17000 18000 19000 20000
## 1 13 361 8 7 21 1 200
## 21000 22000 23000 24000 25000 27000 30000 32000
## 4 11 3 2 85 3 68 2
## 32500 35000 40000 42000 45000 50000 or more
## 1 13 17 1 4 29
# !!!Include relevant variables in list below
# Candidate indirect-PII variables flagged during the dictionary review.
indirect_PII <- c(
  "D_1", "D_3", "D_6", "D_9", "D_20",
  "Inc_16", "Inc_20a",
  "ME_1", "ME_3", "ME_5", "ME_7", "ME_13", "ME_14", "ME_16", "ME_17"
)
# Print frequency tables for each candidate variable to gauge re-identification risk.
capture_tables (indirect_PII)
# Recode those with very specific values where more than half of the sample have actual data.
# Ethnicity (D_3) has many sparse categories (several with counts of 1-2);
# replace the labels with anonymized numeric codes. 999999 is the missing code.
mydata <- encode_location (variables= "D_3", missing=999999)
## [1] "Frequency table before encoding"
## D_3. What is your ethnic background? [You do not need to read the response choices
## chhetri BRAHMAN (HILL) magar tharu tamang newar
## 846 619 206 61 1078 383
## muslim kami yadav rai gurung DAMAIN/DHOLI
## 5 117 13 11 50 70
## limbu thakuri sarki teli CHAMAR/HARIJAN/RAM koiri
## 2 42 66 6 2 74
## kurmi DUSADH/PASWAN/PASI sonar BRAHMAN (TARAI) GHARTI/BHUJEL malla
## 1 6 5 6 48 1
## kalwar kumal HAJAM/THAKUR sunuwar sudhi lohar
## 11 25 2 2 1 3
## tatma khatwe majhi nuniya kumhar danuwar
## 3 1 6 2 3 4
## CHEPANG/PRAJA haluwai rajput kayastha badhae marwadi
## 92 1 3 8 1 4
## thami darai pahari dom bote ADIBASI/JANAJATI
## 18 14 8 1 1 1
## OTHER CASTE <NA>
## 87 1
## [1] "Frequency table after encoding"
## D_3. What is your ethnic background? [You do not need to read the response choices
## 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002
## 846 619 206 61 1078 383 5 117 13 11 50 70 2 42 66 6 2 74 1 6 5 6 48
## 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025
## 1 11 25 2 2 1 3 3 1 6 2 3 4 92 1 3 8 1 4 18 14 8 1
## 1026 1027 1028 <NA>
## 1 1 87 1
# Religion bins for D_6, plus the interview special codes 777/888/999.
break_rel <- c(1, 2, 3, 777, 888, 999)
labels_rel <- setNames(
  c(1, 2, 3, 4, 5, 6),
  c("Hindu", "Buddhist", "Other", "Refused", "Not applicable", "Don't know")
)
# Collapse religion (D_6) into broad categories; 999 is the missing code.
# NOTE(review): with break_rel = c(1,2,3,777,888,999) the raw codes 3, 4, 6 and 9
# all fall into [3,777) and are labeled "Other" (9+2+97+3 = 111 below), so the
# "Refused" / "Not applicable" / "Don't know" labels (4-6) are never assigned.
# Confirm this collapse is intended rather than an error in the break points.
mydata <- ordinal_recode (variable="D_6", break_points=break_rel, missing=999, value_labels=labels_rel)
## recoded
## [1,2) [2,3) [3,777) [777,888) [888,999) [999,1e+03)
## 1 2964 0 0 0 0 0
## 2 0 941 0 0 0 0
## 3 0 0 9 0 0 0
## 4 0 0 2 0 0 0
## 6 0 0 97 0 0 0
## 9 0 0 3 0 0 0
## D_6. What is your religious background? [You do not need to read the response choi
## Hindu Buddhist Other <NA>
## 2964 941 111 5
## [1] "Inspect value labels and relabel as necessary"
## Hindu Buddhist Other Refused Not applicable Don't know
## 1 2 3 4 5 6
# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
# IDIOM FIX: use `<-` (not `=`) for top-level assignment, per R style conventions.
selectedKeyVars <- c('D_1', 'D_2', 'D_4') ##!!! Replace with candidate categorical demo vars
# weight variable
# selectedWeightVar = c('projwt') ##!!! Replace with weight var
# household id variable (cluster)
# selectedHouseholdID = c('wpid') ##!!! Replace with household id
# creating the sdcMicro object with the assigned variables
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial
## The input dataset consists of 4021 rows and 31 variables.
## --> Categorical key variables: D_1, D_2, D_4
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## D_1 2 (2) 2010.500 (2010.500) 2006 (2006)
## D_2 6 (6) 670.167 (670.167) 226 (226)
## D_4 9 (9) 501.000 (501.000) 74 (74)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 2 (0.050%)
## - 3-anonymity: 3 (0.075%)
## - 5-anonymity: 23 (0.572%)
##
## ----------------------------------------------------------------------
# Show values of key variable of records that violate k-anonymity
#mydata <- labelDataset(mydata)
# Flag records whose key-variable combination occurs fewer than 2 times
# (column 2 of sdcInitial@risk$individual — presumably the sample frequency
# fk used by sdcMicro; verify against the package documentation).
individual_risk <- sdcInitial@risk$individual
notAnon <- individual_risk[, 2] < 2 # for 2-anonymity
mydata[notAnon, selectedKeyVars]
## # A tibble: 2 x 3
## D_1 D_2 D_4
## <dbl+lbl> <dbl+lbl> <dbl+lbl>
## 1 2 [Female] 1 [Less than 15] 4 [SLC (11)]
## 2 2 [Female] 5 [45-54] 6 [Bachelor/Postgraduate level]
# Suppress key-variable values on the violating records via sdcMicro's
# local suppression (default k = 2).
sdcFinal <- localSuppression(sdcInitial)
# Recombining anonymized variables
extractManipData(sdcFinal)[notAnon,selectedKeyVars] # manipulated variables HH
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
## D_1 D_2 D_4
## 226 2 1 NA
## 2180 2 5 NA
# Suppression left D_4 as NA for the violating records; store the dataset's
# own missing code for D_4 (999, as used in ordinal_recode above) instead.
mydata [notAnon,"D_4"] <- 999
# !!!No open ends
# !!!No GPS
# Adds "_PU" (Public Use) to the end of the name
public_use_base <- paste0(filename, "_PU")
haven::write_dta(mydata, paste0(public_use_base, ".dta"))
haven::write_sav(mydata, paste0(public_use_base, ".sav"))