# Start from a clean workspace. `all.names = TRUE` is spelled out in full:
# a bare `t` is base::t() (the transpose function), not a logical, and `all`
# only reached `all.names` through partial argument matching.
rm(list = ls(all.names = TRUE))

# Base name of the dataset being processed; it is reused at the end of the
# script to build the names of the public-use output files.
filename <- "Nepal_HT_Study_Rounds_1_2_3_Processed" # !!!Update filename

# Helper script sourced below. Presumably it defines the encode/recode helpers
# and loads `mydata` used throughout this script -- confirm against that file.
functions_vers <- "functions_1.6.R" # !!!Update helper functions file
source(functions_vers)
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Location: Small Location (<100,000), Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary
#!!!Save flagged dictionary in .xlsx format and continue processing data with subset of flagged variables
# !!!No direct PII
# !!!Replace vector in "variables" field below with relevant variable names
# Direct PII-team: encode the surveyor variable so that field-team names are
# replaced by numeric codes (compare the before/after frequency tables below).
mydata <- encode_direct_PII_team(
  variables = c("Srvyr")
)
## [1] "Frequency table before encoding"
## Srvyr. Srvyr
## alka.adhikari ambir.raj.kulung amrita.roka anjana.kumari.dulal
## 15052 79 96 89 98
## ashish.shrestha bhanu.bhakta.dhakal dev.raj.nepal dhan.kumari.darlami gita.maharjan
## 82 77 2 85 99
## kamala.sharma manjula.giri min.kumari.shrestha nabina.khadka niraj.shrestha
## 78 99 86 80 85
## pramila.shrestha pratika.shrestha rabischandra.bhatta ram.kumar.acharya sajina.shrestha
## 77 85 88 88 73
## sandip.shrestha sapana.gautam sarita.shrestha tirtha.maya.rai yamuna.karki
## 96 80 99 104 86
## [1] "Frequency table after encoding"
## Srvyr. Srvyr
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
## 15052 79 96 89 98 82 77 2 85 99 78 99 86 80 85 77 85 88 88
## 20 21 22 23 24 25
## 73 96 80 99 104 86
# !!!Include relevant variables, but check their population size first to confirm they are <100,000
# Small-location variables to encode; 999999 is passed as the missing code.
locvars <- c(
  "dist_vdc", # VDC code
  "ward"      # Ward number
)
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## dist_vdc. VDC code
## Barahathawa Dhungrekhola Dhurkauli Lalbandi Malangawa N.P.
## 227 224 214 219 216
## Netraganj Raniganj Sankarpur Bhimeswor N.P. Bocha
## 226 231 213 215 200
## Dandakharka Fasku Katakuti Lamidada Melung
## 218 217 220 215 226
## Pawati Badegau Talramarang BhoteNamlang Irkhu
## 218 215 225 229 214
## Ichok Kadambas Langarche Melamchi Anaikot
## 220 216 218 224 214
## BaluwapatiDeupur ChalalGaneshsthan KalatiBhumidanda MahankalChaur Methinkot
## 207 242 227 209 221
## Patalekhet RaviOpi Balkot Changunarayan Chitapol
## 217 228 199 221 223
## Duwakot Gundu Madhyapur Thimi NP Nankhel Sirutar
## 216 222 227 216 210
## Baireni Dhussa Khari Kiranchok Naubise
## 222 223 218 223 218
## Salyantar SunaulaBazar Thakre Chitlang Churiyamai
## 224 219 215 210 217
## Fakhel Kulekhani Nibuwatar Padampokhari ShreepurChhatiwan
## 216 223 212 234 223
## SisneriMahadevsthan Birendranagar Jutpani Kathar Khairahani
## 218 207 225 226 221
## Padampur Parbatipur Piple Shaktikhor Chhayachhetra
## 224 215 232 226 188
## Damachaur Devsthal Dhanwang Phalawang Sibaratha
## 189 195 190 192 186
## Siddheswar Tribeni Baijapur Binauna Chisapani
## 189 187 189 186 195
## Khaskusma Kohalpur Nepalgunj Rajhena Samserganj
## 186 186 180 186 189
## [1] "Frequency table after encoding"
## dist_vdc. VDC code
## 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580
## 234 222 186 214 228 217 224 215 199 188 214 227 220 216 210 186 216 215 219 232 195 210 209 221 223 226 226 215 226
## 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609
## 200 216 221 180 189 189 215 195 216 213 223 226 207 227 207 222 189 218 217 216 187 231 224 190 186 218 223 186 225
## 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631
## 221 220 242 214 215 218 217 223 227 219 189 186 212 218 218 224 225 218 223 224 229 192
## [1] "Frequency table before encoding"
## ward. Ward Number
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 <NA>
## 913 883 563 813 566 940 737 578 787 36 47 60 38 48 10054
## [1] "Frequency table after encoding"
## ward. Ward Number
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 <NA>
## 60 787 883 737 813 578 566 913 563 940 36 48 38 47 10054
# Focus on variables with a "Lowest Freq" of 10 or less.
# Age brackets: value labels first, then the cut points they correspond to.
# Each half-open interval [a, b) between consecutive cut points receives the
# label in the same position (see the "recoded" cross-table below); the last
# label ("NA") lines up with the interval above the final cut point.
labels_age <- c(
  "Less than 15" = 1,
  "15-24" = 2,
  "25-34" = 3,
  "35-44" = 4,
  "45-54" = 5,
  "55 and older" = 6,
  "NA" = 7
)
break_age <- c(0, 15, 25, 35, 45, 55, 100)

# Recode `age` into the brackets defined above.
mydata <- ordinal_recode(
  variable = "age",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)
## [1] "Frequency table before encoding"
## age. Age
## 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
## 564 717 845 955 877 804 701 786 629 563 459 331 490 441 411 384 365 530 294 328 264 263 361 217 260 195 228 337 152
## 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
## 246 198 205 245 208 154 218 166 172 143 131 136 131 190 98 93 111 95 141 55 92 41 34 6 3
## recoded
## [0,15) [15,25) [25,35) [35,45) [45,55) [55,100) [100,1e+06)
## 13 564 0 0 0 0 0 0
## 14 717 0 0 0 0 0 0
## 15 0 845 0 0 0 0 0
## 16 0 955 0 0 0 0 0
## 17 0 877 0 0 0 0 0
## 18 0 804 0 0 0 0 0
## 19 0 701 0 0 0 0 0
## 20 0 786 0 0 0 0 0
## 21 0 629 0 0 0 0 0
## 22 0 563 0 0 0 0 0
## 23 0 459 0 0 0 0 0
## 24 0 331 0 0 0 0 0
## 25 0 0 490 0 0 0 0
## 26 0 0 441 0 0 0 0
## 27 0 0 411 0 0 0 0
## 28 0 0 384 0 0 0 0
## 29 0 0 365 0 0 0 0
## 30 0 0 530 0 0 0 0
## 31 0 0 294 0 0 0 0
## 32 0 0 328 0 0 0 0
## 33 0 0 264 0 0 0 0
## 34 0 0 263 0 0 0 0
## 35 0 0 0 361 0 0 0
## 36 0 0 0 217 0 0 0
## 37 0 0 0 260 0 0 0
## 38 0 0 0 195 0 0 0
## 39 0 0 0 228 0 0 0
## 40 0 0 0 337 0 0 0
## 41 0 0 0 152 0 0 0
## 42 0 0 0 246 0 0 0
## 43 0 0 0 198 0 0 0
## 44 0 0 0 205 0 0 0
## 45 0 0 0 0 245 0 0
## 46 0 0 0 0 208 0 0
## 47 0 0 0 0 154 0 0
## 48 0 0 0 0 218 0 0
## 49 0 0 0 0 166 0 0
## 50 0 0 0 0 172 0 0
## 51 0 0 0 0 143 0 0
## 52 0 0 0 0 131 0 0
## 53 0 0 0 0 136 0 0
## 54 0 0 0 0 131 0 0
## 55 0 0 0 0 0 190 0
## 56 0 0 0 0 0 98 0
## 57 0 0 0 0 0 93 0
## 58 0 0 0 0 0 111 0
## 59 0 0 0 0 0 95 0
## 60 0 0 0 0 0 141 0
## 61 0 0 0 0 0 55 0
## 62 0 0 0 0 0 92 0
## 63 0 0 0 0 0 41 0
## 64 0 0 0 0 0 34 0
## 65 0 0 0 0 0 6 0
## 66 0 0 0 0 0 3 0
## [1] "Frequency table after encoding"
## age. Age
## Less than 15 15-24 25-34 35-44 45-54 55 and older
## 1281 6950 3770 2399 1704 959
## [1] "Inspect value labels and relabel as necessary"
## Less than 15 15-24 25-34 35-44 45-54 55 and older NA
## 1 2 3 4 5 6 7
# Recode `ager` (average age in years) with the same cut points and labels
# that were used for `age`.
mydata <- ordinal_recode(
  variable = "ager",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)
## [1] "Frequency table before encoding"
## ager. Average Age (years)
## 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
## 564 717 845 955 877 804 701 786 629 563 459 331 490 441 411 384 365 530 294 328 264 263 361 217 260 195 228 337 152
## 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
## 246 198 205 245 208 154 218 166 172 143 131 136 131 190 98 93 111 95 141 55 92 41 34 6 3
## recoded
## [0,15) [15,25) [25,35) [35,45) [45,55) [55,100) [100,1e+06)
## 13 564 0 0 0 0 0 0
## 14 717 0 0 0 0 0 0
## 15 0 845 0 0 0 0 0
## 16 0 955 0 0 0 0 0
## 17 0 877 0 0 0 0 0
## 18 0 804 0 0 0 0 0
## 19 0 701 0 0 0 0 0
## 20 0 786 0 0 0 0 0
## 21 0 629 0 0 0 0 0
## 22 0 563 0 0 0 0 0
## 23 0 459 0 0 0 0 0
## 24 0 331 0 0 0 0 0
## 25 0 0 490 0 0 0 0
## 26 0 0 441 0 0 0 0
## 27 0 0 411 0 0 0 0
## 28 0 0 384 0 0 0 0
## 29 0 0 365 0 0 0 0
## 30 0 0 530 0 0 0 0
## 31 0 0 294 0 0 0 0
## 32 0 0 328 0 0 0 0
## 33 0 0 264 0 0 0 0
## 34 0 0 263 0 0 0 0
## 35 0 0 0 361 0 0 0
## 36 0 0 0 217 0 0 0
## 37 0 0 0 260 0 0 0
## 38 0 0 0 195 0 0 0
## 39 0 0 0 228 0 0 0
## 40 0 0 0 337 0 0 0
## 41 0 0 0 152 0 0 0
## 42 0 0 0 246 0 0 0
## 43 0 0 0 198 0 0 0
## 44 0 0 0 205 0 0 0
## 45 0 0 0 0 245 0 0
## 46 0 0 0 0 208 0 0
## 47 0 0 0 0 154 0 0
## 48 0 0 0 0 218 0 0
## 49 0 0 0 0 166 0 0
## 50 0 0 0 0 172 0 0
## 51 0 0 0 0 143 0 0
## 52 0 0 0 0 131 0 0
## 53 0 0 0 0 136 0 0
## 54 0 0 0 0 131 0 0
## 55 0 0 0 0 0 190 0
## 56 0 0 0 0 0 98 0
## 57 0 0 0 0 0 93 0
## 58 0 0 0 0 0 111 0
## 59 0 0 0 0 0 95 0
## 60 0 0 0 0 0 141 0
## 61 0 0 0 0 0 55 0
## 62 0 0 0 0 0 92 0
## 63 0 0 0 0 0 41 0
## 64 0 0 0 0 0 34 0
## 65 0 0 0 0 0 6 0
## 66 0 0 0 0 0 3 0
## [1] "Frequency table after encoding"
## ager. Average Age (years)
## Less than 15 15-24 25-34 35-44 45-54 55 and older
## 1281 6950 3770 2399 1704 959
## [1] "Inspect value labels and relabel as necessary"
## Less than 15 15-24 25-34 35-44 45-54 55 and older NA
## 1 2 3 4 5 6 7
# Redefine the age brackets for IDR3_20 ("How old are you?"): this scheme
# starts at 15 and splits out a separate "65 and older" group. Note that this
# overwrites the `break_age` / `labels_age` objects used for the earlier
# recodes.
labels_age <- c(
  "15-24" = 1,
  "25-34" = 2,
  "35-44" = 3,
  "45-54" = 4,
  "55-64" = 5,
  "65 and older" = 6,
  "NA" = 7
)
break_age <- c(15, 25, 35, 45, 55, 65, 100)

mydata <- ordinal_recode(
  variable = "IDR3_20",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)
## [1] "Frequency table before encoding"
## IDR3_20. How old are you?
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## 18 44 60 62 63 72 68 72 78 60 49 41 35 53 59 50 56 54 53
## 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
## 52 36 35 37 51 34 42 28 41 44 31 36 34 37 34 32 28 31 29
## 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 <NA>
## 28 23 27 19 25 32 15 18 17 18 13 13 9 12 2 1 15052
## recoded
## [15,25) [25,35) [35,45) [45,55) [55,65) [65,100) [100,1e+06)
## 16 18 0 0 0 0 0 0
## 17 44 0 0 0 0 0 0
## 18 60 0 0 0 0 0 0
## 19 62 0 0 0 0 0 0
## 20 63 0 0 0 0 0 0
## 21 72 0 0 0 0 0 0
## 22 68 0 0 0 0 0 0
## 23 72 0 0 0 0 0 0
## 24 78 0 0 0 0 0 0
## 25 0 60 0 0 0 0 0
## 26 0 49 0 0 0 0 0
## 27 0 41 0 0 0 0 0
## 28 0 35 0 0 0 0 0
## 29 0 53 0 0 0 0 0
## 30 0 59 0 0 0 0 0
## 31 0 50 0 0 0 0 0
## 32 0 56 0 0 0 0 0
## 33 0 54 0 0 0 0 0
## 34 0 53 0 0 0 0 0
## 35 0 0 52 0 0 0 0
## 36 0 0 36 0 0 0 0
## 37 0 0 35 0 0 0 0
## 38 0 0 37 0 0 0 0
## 39 0 0 51 0 0 0 0
## 40 0 0 34 0 0 0 0
## 41 0 0 42 0 0 0 0
## 42 0 0 28 0 0 0 0
## 43 0 0 41 0 0 0 0
## 44 0 0 44 0 0 0 0
## 45 0 0 0 31 0 0 0
## 46 0 0 0 36 0 0 0
## 47 0 0 0 34 0 0 0
## 48 0 0 0 37 0 0 0
## 49 0 0 0 34 0 0 0
## 50 0 0 0 32 0 0 0
## 51 0 0 0 28 0 0 0
## 52 0 0 0 31 0 0 0
## 53 0 0 0 29 0 0 0
## 54 0 0 0 28 0 0 0
## 55 0 0 0 0 23 0 0
## 56 0 0 0 0 27 0 0
## 57 0 0 0 0 19 0 0
## 58 0 0 0 0 25 0 0
## 59 0 0 0 0 32 0 0
## 60 0 0 0 0 15 0 0
## 61 0 0 0 0 18 0 0
## 62 0 0 0 0 17 0 0
## 63 0 0 0 0 18 0 0
## 64 0 0 0 0 13 0 0
## 65 0 0 0 0 0 13 0
## 66 0 0 0 0 0 9 0
## 67 0 0 0 0 0 12 0
## 68 0 0 0 0 0 2 0
## 69 0 0 0 0 0 1 0
## [1] "Frequency table after encoding"
## IDR3_20. How old are you?
## 15-24 25-34 35-44 45-54 55-64 65 and older <NA>
## 537 510 400 320 207 37 15052
## [1] "Inspect value labels and relabel as necessary"
## 15-24 25-34 35-44 45-54 55-64 65 and older NA
## 1 2 3 4 5 6 7
# Remove variables derived from ordinal variables that have been recoded,
# so the original detail is not left behind in the data.
remove_ordinals <- c("age2", "ln_hhincome", "ln_exp")
# Keep only the columns whose name has no match in `remove_ordinals`.
mydata <- mydata[is.na(match(names(mydata), remove_ordinals))]
# !!!Include relevant variables in list below
# Candidate indirect-PII variables passed to capture_tables() for inspection.
indirect_PII <- c(
  "d6", "d9", "inc17", "inc23", "em16_2", "cm29",
  "me3", "me5", "me13", "me14", "ME_16",
  "edu2", "ageg", "ageg2", "incomecat", "inc16r", "d6r",
  "ethnic", "caste",
  "me3r", "me14r", "me5r", "me7r",
  "ethnicity", "education", "childnum"
)
capture_tables(indirect_PII)
# Top code household composition variables with large and unusual numbers.
# childnum: cases with 5 or more children are collapsed into "5 or more";
# 888 and 999999 are passed as missing codes.
mydata <- top_recode(
  "childnum",
  break_point = 5,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## childnum. Number of children
## 0 1 2 3 4 5 6 7 8 9 10 11 <NA>
## 7247 1989 2942 2301 1205 679 372 132 52 12 7 6 119
## [1] "Frequency table after encoding"
## childnum. Number of children
## 0 1 2 3 4 5 or more <NA>
## 7247 1989 2942 2301 1205 1260 119
# d20 ("How many children do you have?"): same top code as childnum, i.e.
# cases with 5 or more children are collapsed into "5 or more".
mydata <- top_recode(
  "d20",
  break_point = 5,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## d20. How many children do you have?
## 0 1 2 3 4 5 6 7 8 9 10 11 888
## 7247 1989 2942 2301 1205 679 372 132 52 12 7 6 119
## [1] "Frequency table after encoding"
## d20. How many children do you have?
## 0 1 2 3 4 5 or more 888
## 7247 1989 2942 2301 1205 1260 119
# Top code high income to the 99.5 percentile.
# The percentile is computed on inc17 with the 999999 missing code excluded
# and is rounded down to a whole number. `na.rm = TRUE` keeps quantile() from
# aborting if the column contains NA (comparing NA with 999999 yields NA,
# which would otherwise be carried into the quantile input).
# NOTE(review): values 777 / 888 / 999 appear in the frequency table below and
# look like non-response codes; they are currently included in the percentile
# calculation -- confirm this is intended.
percentile_99.5 <- floor(
  quantile(
    mydata$inc17[mydata$inc17 != 999999],
    probs = c(0.995),
    na.rm = TRUE
  )
)
mydata <- top_recode(
  variable = "inc17",
  break_point = percentile_99.5,
  missing = 999999
)
## [1] "Frequency table before encoding"
## inc17. Approximately what was your household's cash income in the last month? (in NRS)
## 0 10 15 30 35 36 300 400 500 600 777 888 999 1000
## 564 3 3 4 3 4 3 9 27 3 201 4 1020 106
## 1200 1250 1300 1400 1500 1600 2000 2083 2100 2200 2300 2500 2600 3000
## 38 4 3 3 118 8 382 3 3 3 4 73 3 570
## 3500 3800 4000 4100 4500 5000 5500 5600 6000 6500 7000 7500 8000 8500
## 23 3 535 4 33 1493 8 3 579 7 520 29 509 3
## 9000 9500 10000 10400 10500 11000 12000 12200 12500 12800 13000 14000 15000 16000
## 255 4 1709 3 7 41 671 4 23 3 179 94 1636 140
## 17000 17200 18000 19000 20000 21000 22000 23000 24000 24400 24500 25000 26000 27000
## 82 4 178 13 1420 28 75 20 31 4 3 816 26 12
## 28000 29000 30000 31000 32000 33000 34500 35000 36000 38000 39000 40000 41000 41600
## 27 9 751 11 21 18 4 254 17 3 3 342 2 3
## 42000 43000 44000 45000 50000 52000 53000 55000 58000 59500 60000 65000 66000 70000
## 7 9 3 83 414 3 4 27 8 3 170 4 3 44
## 75000 80000 85000 90000 1e+05 108000 109000 110000 120000 125000 130000 150000 180000 190000
## 16 56 13 21 115 3 3 4 13 8 3 50 3 3
## 2e+05 3e+05 320000 350000 4e+05 5e+05 630000 7e+05 1500000 2e+06
## 74 3 4 4 4 3 3 4 3 4
## [1] "Frequency table after encoding"
## inc17. Approximately what was your household's cash income in the last month? (in NRS)
## 0 10 15 30 35 36 300 400
## 564 3 3 4 3 4 3 9
## 500 600 777 888 999 1000 1200 1250
## 27 3 201 4 1020 106 38 4
## 1300 1400 1500 1600 2000 2083 2100 2200
## 3 3 118 8 382 3 3 3
## 2300 2500 2600 3000 3500 3800 4000 4100
## 4 73 3 570 23 3 535 4
## 4500 5000 5500 5600 6000 6500 7000 7500
## 33 1493 8 3 579 7 520 29
## 8000 8500 9000 9500 10000 10400 10500 11000
## 509 3 255 4 1709 3 7 41
## 12000 12200 12500 12800 13000 14000 15000 16000
## 671 4 23 3 179 94 1636 140
## 17000 17200 18000 19000 20000 21000 22000 23000
## 82 4 178 13 1420 28 75 20
## 24000 24400 24500 25000 26000 27000 28000 29000
## 31 4 3 816 26 12 27 9
## 30000 31000 32000 33000 34500 35000 36000 38000
## 751 11 21 18 4 254 17 3
## 39000 40000 41000 41600 42000 43000 44000 45000
## 3 342 2 3 7 9 3 83
## 50000 52000 53000 55000 58000 59500 60000 65000
## 414 3 4 27 8 3 170 4
## 66000 70000 75000 80000 85000 90000 1e+05 108000
## 3 44 16 56 13 21 115 3
## 109000 110000 120000 125000 130000 150000 180000 190000
## 3 4 13 8 3 50 3 3
## 2e+05 or more
## 106
# Top code household expenditure (inc23) to its 99.5 percentile.
# The percentile is computed with the 999999 missing code excluded and is
# rounded down to a whole number; this overwrites the `percentile_99.5` value
# computed for inc17. `na.rm = TRUE` keeps quantile() from aborting if the
# column contains NA.
# NOTE(review): values 777 / 999 appear in the frequency table below and look
# like non-response codes; they are currently included in the percentile
# calculation -- confirm this is intended.
percentile_99.5 <- floor(
  quantile(
    mydata$inc23[mydata$inc23 != 999999],
    probs = c(0.995),
    na.rm = TRUE
  )
)
mydata <- top_recode(
  variable = "inc23",
  break_point = percentile_99.5,
  missing = 999999
)
## [1] "Frequency table before encoding"
## inc23. In a typical month, what is your total household expenditure? (in NRS)
## 15 200 300 400 500 600 700 777 800 900 999 1000 1070 1100 1200 1300
## 4 3 10 3 59 3 3 14 3 4 582 135 3 4 47 7
## 1400 1500 1600 1800 2000 2200 2400 2500 2600 2800 3000 3500 4000 4009 4500 5000
## 3 128 4 4 564 11 3 209 4 3 1061 97 1120 3 56 2461
## 5500 6000 6500 7000 7500 8000 9000 10000 11000 12000 13000 13500 14000 15000 16000 17000
## 6 1192 3 980 11 910 353 2615 22 629 157 4 55 1517 45 30
## 18000 19000 20000 21000 22000 23000 24000 25000 27000 28000 30000 32000 32500 35000 40000 42000
## 85 4 847 16 45 14 13 327 12 3 268 7 3 57 76 2
## 45000 50000 60000 70000 80000 90000 1e+05 120000 140000 2e+05 9e+05
## 13 56 30 10 7 4 7 3 3 3 4
## [1] "Frequency table after encoding"
## inc23. In a typical month, what is your total household expenditure? (in NRS)
## 15 200 300 400 500 600 700 777
## 4 3 10 3 59 3 3 14
## 800 900 999 1000 1070 1100 1200 1300
## 3 4 582 135 3 4 47 7
## 1400 1500 1600 1800 2000 2200 2400 2500
## 3 128 4 4 564 11 3 209
## 2600 2800 3000 3500 4000 4009 4500 5000
## 4 3 1061 97 1120 3 56 2461
## 5500 6000 6500 7000 7500 8000 9000 10000
## 6 1192 3 980 11 910 353 2615
## 11000 12000 13000 13500 14000 15000 16000 17000
## 22 629 157 4 55 1517 45 30
## 18000 19000 20000 21000 22000 23000 24000 25000
## 85 4 847 16 45 14 13 327
## 27000 28000 30000 32000 32500 35000 40000 42000
## 12 3 268 7 3 57 76 2
## 45000 50000 or more
## 13 127
# Encode caste / ethnic background (D_3): the location encoder is reused here
# so the category names are replaced by numeric codes.
mydata <- encode_location(
  variables = "D_3",
  missing = 999999
)
## [1] "Frequency table before encoding"
## D_3. What is your ethnic background? [You do not need to read the response choices
## chhetri BRAHMAN (HILL) magar tharu tamang newar
## 4334 2424 1017 654 3730 1361
## muslim kami yadav rai gurung DAMAIN/DHOLI
## 23 636 69 44 177 315
## limbu thakuri sarki teli CHAMAR/HARIJAN/RAM koiri
## 8 258 235 21 7 260
## kurmi DUSADH/PASWAN/PASI sonar BRAHMAN (TARAI) GHARTI/BHUJEL malla
## 4 30 34 30 168 3
## kalwar kumal HAJAM/THAKUR sunuwar sudhi lohar
## 38 117 9 10 3 15
## tatma khatwe majhi nuniya kumhar danuwar
## 9 9 24 8 10 13
## CHEPANG/PRAJA haluwai rajput kayastha badhae marwadi
## 319 4 17 27 3 15
## thami darai pahari dom bote ADIBASI/JANAJATI
## 61 51 34 4 4 3
## badi OTHER CASTE <NA>
## 12 395 7
## [1] "Frequency table after encoding"
## D_3. What is your ethnic background? [You do not need to read the response choices
## 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406
## 235 3 8 9 2424 3 9 38 27 4 10 654 10 7 24 15 21 319 13 177 34 34 30
## 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429
## 260 4 3 258 636 12 9 8 117 3 1017 44 61 3730 69 315 30 4 1361 15 4334 51 168
## 430 431 432 433 <NA>
## 23 17 395 4 7
# Recode religion (d6) into Hindu / Buddhist / Other.
# Codes from 3 up to (but excluding) 777 all fall into the single "Other"
# interval; 777, 888 and 999 get their own intervals and labels.
labels_rel <- c(
  "Hindu" = 1,
  "Buddhist" = 2,
  "Other" = 3,
  "Refused" = 4,
  "Not applicable" = 5,
  "Don't know" = 6
)
break_rel <- c(1, 2, 3, 777, 888, 999)
mydata <- ordinal_recode(
  variable = "d6",
  break_points = break_rel,
  missing = 999,
  value_labels = labels_rel
)
## [1] "Frequency table before encoding"
## d6. What is your religious background? [You do not need to read the response choi
## hindu buddhist islam kirant christian OTHER RELIGION <NA>
## 13305 3278 41 7 393 10 29
## recoded
## [1,2) [2,3) [3,777) [777,888) [888,999) [999,1e+03)
## 1 13305 0 0 0 0 0
## 2 0 3278 0 0 0 0
## 3 0 0 41 0 0 0
## 4 0 0 7 0 0 0
## 6 0 0 393 0 0 0
## 9 0 0 10 0 0 0
## [1] "Frequency table after encoding"
## d6. What is your religious background? [You do not need to read the response choi
## Hindu Buddhist Other <NA>
## 13305 3278 451 29
## [1] "Inspect value labels and relabel as necessary"
## Hindu Buddhist Other Refused Not applicable Don't know
## 1 2 3 4 5 6
# Recode education (d4) into standard categories.
# Cut points are on the numeric codes of d4; each half-open interval [a, b)
# between consecutive cut points receives the label in the same position
# (see the "recoded" cross-table below).
labels_edu <- c(
  "Primary or less (0-5)" = 1,
  "Lower secondary (6-8)" = 2,
  "Secondary (9-10)" = 3,
  "SLC (11)" = 4,
  "CLASS 12/Intermediate level (12)" = 5,
  "Bachelor/Postgraduate level" = 6,
  "Literate, but never attended school" = 7,
  "Illiterate, and never attended school" = 8,
  "Refused" = 9,
  "Does not apply" = 10,
  "Don't Know" = 11
)
break_edu <- c(0, 6, 9, 11, 12, 13, 17, 18, 777, 888, 999)
mydata <- ordinal_recode(
  variable = "d4",
  break_points = break_edu,
  missing = 999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## d4. What is your highest completed education level?<U+00A0> [You do not need to read the
## PRE-SCHOOL/KINDERGARTEN CLASS 1 CLASS 2
## 23 321 578
## CLASS 3 CLASS 4 CLASS 5
## 634 802 1180
## CLASS 6 CLASS 7 CLASS 8
## 925 1271 1401
## CLASS 9 CLASS 10 slc
## 1023 1078 2577
## CLASS 12/INTERMEDIATE LEVEL BACHELOR LEVEL MASTER LEVEL
## 1479 261 81
## LITERATE, BUT NEVER ATTENDED SCHOOL ILLITERATE, AND NEVER ATTENDED SCHOOL <NA>
## 1277 2104 48
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,17) [17,18) [18,777) [777,888) [888,999) [999,1e+03)
## 0 23 0 0 0 0 0 0 0 0 0 0
## 1 321 0 0 0 0 0 0 0 0 0 0
## 2 578 0 0 0 0 0 0 0 0 0 0
## 3 634 0 0 0 0 0 0 0 0 0 0
## 4 802 0 0 0 0 0 0 0 0 0 0
## 5 1180 0 0 0 0 0 0 0 0 0 0
## 6 0 925 0 0 0 0 0 0 0 0 0
## 7 0 1271 0 0 0 0 0 0 0 0 0
## 8 0 1401 0 0 0 0 0 0 0 0 0
## 9 0 0 1023 0 0 0 0 0 0 0 0
## 10 0 0 1078 0 0 0 0 0 0 0 0
## 11 0 0 0 2577 0 0 0 0 0 0 0
## 12 0 0 0 0 1479 0 0 0 0 0 0
## 13 0 0 0 0 0 261 0 0 0 0 0
## 14 0 0 0 0 0 81 0 0 0 0 0
## 17 0 0 0 0 0 0 1277 0 0 0 0
## 18 0 0 0 0 0 0 0 2104 0 0 0
## [1] "Frequency table after encoding"
## d4. What is your highest completed education level?<U+00A0> [You do not need to read the
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 3538 3597 2101
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 2577 1479 342
## Literate, but never attended school Illiterate, and never attended school <NA>
## 1277 2104 48
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 1 2 3
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 4 5 6
## Literate, but never attended school Illiterate, and never attended school Refused
## 7 8 9
## Does not apply Don't Know
## 10 11
# Recode marital status (d9) into married vs "others": code 1 stays "Married",
# every code from 2 up to (but excluding) 1000 becomes "Others".
labels_mar <- c(
  "Married" = 1,
  "Others" = 2
)
break_mar <- c(1, 2, 1000)
mydata <- ordinal_recode(
  variable = "d9",
  break_points = break_mar,
  missing = 999999,
  value_labels = labels_mar
)
## [1] "Frequency table before encoding"
## d9. What is your marital status?
## Married Separated/Divorced Widowed Never Married
## 10418 100 281 6264
## recoded
## [1,2) [2,1e+03) [1e+03,1e+06)
## 1 10418 0 0
## 2 0 100 0
## 3 0 281 0
## 4 0 6264 0
## [1] "Frequency table after encoding"
## d9. What is your marital status?
## Married Others
## 10418 6645
## [1] "Inspect value labels and relabel as necessary"
## Married Others
## 1 2
# k-anonymity check with sdcMicro.
# Based on dictionary inspection, pick the categorical key variables used to
# build the sdcMicro object; every name must match a column in the data file.
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# Selected categorical key variables: gender, occupation/education and age.
selectedKeyVars <- c(
  "D_1",
  "edu2",
  "age"
) ##!!! Replace with candidate categorical demo vars

# Create the sdcMicro object from the assigned key variables, then display it
# to review the anonymity summary.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)
sdcInitial # No cases violate 2-anonymity.
## The input dataset consists of 17063 rows and 410 variables.
## --> Categorical key variables: D_1, edu2, age
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## D_1 2 (2) 8531.500 (8531.500) 8495 (8495)
## edu2 8 (8) 2430.714 (2430.714) 342 (342)
## age 6 (6) 2843.833 (2843.833) 959 (959)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 0 (0.000%)
## - 3-anonymity: 0 (0.000%)
## - 5-anonymity: 9 (0.053%)
##
## ----------------------------------------------------------------------
# !!! Identify open-end variables here:
# Free-text variables handed to report_open() for review.
open_ends <- c(
  "HTV_1_10_TEXT",
  "HTV_3_11_TEXTx3",
  "HTV_3_11_TEXTx3_Translation"
)
report_open(list_open_ends = open_ends)
# Review "verbatims.csv". Identify variables to be deleted or redacted and
# their row number.
# !!!No GPS data
# Drop the two open-ended text variables and htv_1_1r in a single pass: keep
# only the columns whose name has no match in the drop list.
# NOTE(review): "HTV_3_11_TEXTx3_Translation" is listed in `open_ends` but is
# not dropped here -- confirm it is meant to be retained.
mydata <- mydata[is.na(match(
  names(mydata),
  c("HTV_1_10_TEXT", "HTV_3_11_TEXTx3", "htv_1_1r")
))]
# Adds "_PU" (Public Use) to the end of the name
# Write the processed data in Stata (.dta) and SPSS (.sav) formats; both file
# names are the base `filename` followed by the "_PU" suffix.
haven::write_dta(
  data = mydata,
  path = paste0(filename, "_PU.dta")
)
haven::write_sav(
  data = mydata,
  path = paste0(filename, "_PU.sav")
)