# Start from a clean workspace so objects left over from a previous dataset
# cannot leak into this run. `TRUE` is spelled out: lowercase `t` is base R's
# transpose function, so ls() would not receive a logical value.
rm(list = ls(all.names = TRUE))

# Dataset to be processed (e.g. "Nepal Round 3_FinalClean.dta")
filename <- "Nepal Round 3_FinalClean" # !!!Update filename

# Load the project helper functions used throughout this script.
# NOTE(review): `mydata` is never assigned in this script, so it is presumably
# created by functions_1.7.R from `filename` -- confirm.
source("functions_1.7.R")
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Location: Small Location (<100,000) Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
#!!!Save flagged dictionary in .xlsx format and continue processing data with subset of flagged variables
# !!!Include any Direct PII variables
# Direct PII columns to remove outright. Each name is listed exactly once
# (the original vector repeated "IDR3_18" and "IDR3_19").
dropvars <- c(
  "IDR3_18", "IDR3_19", "RvwName", "LE_reportedby", "flag_reportedby"
)
# Keep every column whose name is not in `dropvars`.
mydata <- mydata[!(names(mydata) %in% dropvars)]
# !!!Replace vector in "variables" field below with relevant variable names
# Direct PII-team: hand the interviewer-name columns to the project helper and
# keep the data set it returns. The recorded output below shows each name
# replaced by an integer code.
mydata <- encode_direct_PII_team(variables = c("Srvyr", "surveyor"))
## [1] "Frequency table before encoding"
## Srvyr. Srvyr
## alka.adhikari ambir.raj.kulung amrita.roka anjana.kumari.dulal ashish.shrestha bhanu.bhakta.dhakal
## 79 96 94 98 132 79
## dev.raj.nepal dhan.kumari.darlami dilip.joshi gita.maharjan gyanendra.parajuli kajiman.mahatara
## 240 86 216 103 216 234
## kamala.sharma mani.ram.dahal manjula.giri min.kumari.shrestha nabina.khadka niraj.shrestha
## 80 242 99 86 80 89
## prahlad.mainali pramila.shrestha pratika.shrestha rabischandra.bhatta ram.kumar.acharya sajina.shrestha
## 239 77 88 92 91 78
## sandip.shrestha sanjay.pokharel sapana.gautam sarita.shrestha sunil.shrestha tirtha.maya.rai
## 269 68 84 100 270 105
## yamuna.karki
## 87
## [1] "Frequency table after encoding"
## Srvyr. Srvyr
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 79 96 94 98 132 79 240 86 216 103 216 234 80 242 99 86 80 89 239 77 88 92 91 78 269 68 84 100 270 105
## 31
## 87
## [1] "Frequency table before encoding"
## surveyor. Surveyor
## alka.adhikari ambir.raj.kulung amrita.roka anjana.kumari.dulal ashish.shrestha
## 1983 79 96 90 98 82
## bhanu.bhakta.dhakal dev.raj.nepal dhan.kumari.darlami gita.maharjan kamala.sharma manjula.giri
## 77 2 85 99 79 99
## min.kumari.shrestha nabina.khadka niraj.shrestha pramila.shrestha pratika.shrestha rabischandra.bhatta
## 86 80 85 77 85 87
## ram.kumar.acharya sajina.shrestha sandip.shrestha sapana.gautam sarita.shrestha tirtha.maya.rai
## 88 73 97 80 99 105
## yamuna.karki
## 86
## [1] "Frequency table after encoding"
## surveyor. Surveyor
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## 1983 79 96 90 98 82 77 2 85 99 79 99 86 80 85 77 85 87 88 73 97 80 99 105
## 25
## 86
# !!!Include relevant variables, but check their population size first to confirm they are <100,000
# Small-location columns: the overall VDC code plus the per-district
# VDC/municipality questions.
locvars <- c(
  "vdc",
  "IDR3_6_19", "IDR3_6_22", "IDR3_6_23", "IDR3_6_24",
  "IDR3_6_26", "IDR3_6_30", "IDR3_6_31", "IDR3_6_35"
)
# 999999 is passed as the missing code; in the recorded output below it is
# left unchanged by the encoding.
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## vdc. VDC code
## Barahathawa Dhungrekhola Lalbandi Malangawa NP Netraganj
## 64 66 126 63 64
## Raniganj Sankarpur Bhimeswor NP Bocha Dandakharka
## 64 62 61 39 62
## Fasku Katakuti Lamidanda Melung Pawati
## 62 63 62 64 64
## Badegau Irkhu BhoteNamlang Talamarang Ichok
## 64 62 66 63 63
## Kadambas Langarche Melamchi Anaikot Baluwapati Deupur
## 64 62 64 63 62
## Chalal Ganeshthan Kalati Bhumidanda Mahankal Chaur Methinkot Patalekhet
## 67 62 61 62 64
## Raviopi Balkot Changunarayan Chitapol Duwakot
## 62 59 62 63 63
## Gundu Madhyapur Thimi NP Nankhel Sirutar Baireni
## 62 66 62 58 62
## Dhussa Khari Kiranchok Naubise Salyantar
## 64 62 63 64 63
## Sunaula Bazar Thakre Chitlang Churiyamai Fakhel
## 62 64 61 62 62
## Padampokhari Kulekhani Nibuwatar Shreepur Chhatiwan Sisneri Mahadevsthan
## 65 62 60 62 63
## Birendranagar Jutpani Kathar Khairahani Padampur
## 63 63 63 62 61
## Parbatipur Piple Shaktikhor
## 62 65 65
## [1] "Frequency table after encoding"
## vdc. VDC code
## 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496
## 62 62 64 62 65 61 62 63 64 62 64 63 62 62 59 62 62 61 63 64 63 62 66 67 62 64 63 64 63 64
## 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526
## 63 63 60 63 63 62 64 62 64 62 65 62 66 64 62 63 62 61 63 62 66 58 63 61 62 63 62 64 65 39
## 527 528 529 530
## 62 64 62 63
## [1] "Frequency table before encoding"
## IDR3_6_19. VDC or Municaplity of District Sarlahi
## Barahathawa Dhungrekhola Dhurkauli Lalbandi Malangawa NP Netraganj Raniganj Sankarpur 999999
## 64 66 63 63 63 64 64 62 3488
## [1] "Frequency table after encoding"
## IDR3_6_19. VDC or Municaplity of District Sarlahi
## 904 905 906 907 908 909 910 912 999999
## 64 63 64 62 63 64 66 63 3488
## [1] "Frequency table before encoding"
## IDR3_6_22. VDC or Municaplity of District Dolakha
## Bhimeswor NP Bocha Dandakharka Fasku Katakuti Lamidanda Melung Pawati 999999
## 61 39 62 62 63 62 64 64 3520
## [1] "Frequency table after encoding"
## IDR3_6_22. VDC or Municaplity of District Dolakha
## 876 877 878 879 880 881 883 884 999999
## 64 62 62 64 62 61 39 63 3520
## [1] "Frequency table before encoding"
## IDR3_6_23. VDC or Municaplity of District Sindhupalchok
## Badegau Irkhu BhoteNamlang Talamarang Ichok Kadambas Langarche Melamchi 999999
## 64 62 66 64 63 64 62 64 3488
## [1] "Frequency table after encoding"
## IDR3_6_23. VDC or Municaplity of District Sindhupalchok
## 513 514 515 516 517 518 520 521 999999
## 62 64 66 62 64 63 64 64 3488
## [1] "Frequency table before encoding"
## IDR3_6_24. VDC or Municaplity of District Kavrepalanchok
## Anaikot Baluwapati Deupur Chalal Ganeshthan Kalati Bhumidanda Mahankal Chaur Methinkot
## 63 62 67 62 61 62
## Patalekhet Raviopi 999999
## 64 62 3494
## [1] "Frequency table after encoding"
## IDR3_6_24. VDC or Municaplity of District Kavrepalanchok
## 689 690 691 692 693 694 695 697 999999
## 62 62 64 67 63 62 61 62 3494
## [1] "Frequency table before encoding"
## IDR3_6_26. VDC or Municaplity of District Bhaktapur
## Balkot Changunarayan Chitapol Duwakot Gundu Madhyapur Thimi NP
## 59 62 63 63 62 66
## Nankhel Sirutar 999999
## 62 58 3502
## [1] "Frequency table after encoding"
## IDR3_6_26. VDC or Municaplity of District Bhaktapur
## 405 406 407 408 410 411 412 413 999999
## 63 62 62 59 62 63 58 66 3502
## [1] "Frequency table before encoding"
## IDR3_6_30. VDC or Municaplity of District Dhading
## Baireni Dhussa Khari Kiranchok Naubise Salyantar Sunaula Bazar Thakre
## 62 64 62 63 64 63 62 64
## 999999
## 3493
## [1] "Frequency table after encoding"
## IDR3_6_30. VDC or Municaplity of District Dhading
## 634 635 636 637 639 640 641 642 999999
## 62 63 62 64 62 63 64 64 3493
## [1] "Frequency table before encoding"
## IDR3_6_31. VDC or Municaplity of District Makwanpur
## Chitlang Churiyamai Fakhel Padampokhari Kulekhani
## 61 62 62 65 62
## Nibuwatar Shreepur Chhatiwan Sisneri Mahadevsthan 999999
## 60 62 63 3500
## [1] "Frequency table after encoding"
## IDR3_6_31. VDC or Municaplity of District Makwanpur
## 798 799 800 801 802 803 804 805 999999
## 63 62 62 62 62 65 60 61 3500
## [1] "Frequency table before encoding"
## IDR3_6_35. VDC or Municaplity of District Chitwan
## Birendranagar Jutpani Kathar Khairahani Padampur Parbatipur Piple Shaktikhor
## 63 63 63 62 61 62 65 65
## 999999
## 3493
## [1] "Frequency table after encoding"
## IDR3_6_35. VDC or Municaplity of District Chitwan
## 875 876 877 878 879 880 881 882 999999
## 63 65 63 62 63 61 62 65 3493
# Focus on variables with a "Lowest Freq" of 10 or less.
# Lower bounds of the age bands: 15, 25, ..., 65, with 100 closing the last
# band. The recorded output below shows the bands as [15,25), [25,35), ...
break_age <- c(seq(15, 65, by = 10), 100)

# Value labels for the recoded bands; code 7 ("NA") holds the missing code.
labels_age <- setNames(
  as.numeric(1:7),
  c("15-24", "25-34", "35-44", "45-54", "55-64", "65 and older", "NA")
)

# Replace exact age (IDR3_20) with its band.
mydata <- ordinal_recode(
  variable = "IDR3_20",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)
## [1] "Frequency table before encoding"
## IDR3_20. How old are you?
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
## 18 44 60 62 64 72 68 73 78 61 50 42 35 53 60 50 56 55 53 52 36 35 38 51 35 43 28 41 44 31 36 34 38 34 32 29 31 31 28 23
## 56 57 58 59 60 61 62 63 64 65 66 67 68 69
## 27 19 26 32 15 18 17 18 15 13 9 12 2 1
## recoded
## [15,25) [25,35) [35,45) [45,55) [55,65) [65,100) [100,1e+06)
## 16 18 0 0 0 0 0 0
## 17 44 0 0 0 0 0 0
## 18 60 0 0 0 0 0 0
## 19 62 0 0 0 0 0 0
## 20 64 0 0 0 0 0 0
## 21 72 0 0 0 0 0 0
## 22 68 0 0 0 0 0 0
## 23 73 0 0 0 0 0 0
## 24 78 0 0 0 0 0 0
## 25 0 61 0 0 0 0 0
## 26 0 50 0 0 0 0 0
## 27 0 42 0 0 0 0 0
## 28 0 35 0 0 0 0 0
## 29 0 53 0 0 0 0 0
## 30 0 60 0 0 0 0 0
## 31 0 50 0 0 0 0 0
## 32 0 56 0 0 0 0 0
## 33 0 55 0 0 0 0 0
## 34 0 53 0 0 0 0 0
## 35 0 0 52 0 0 0 0
## 36 0 0 36 0 0 0 0
## 37 0 0 35 0 0 0 0
## 38 0 0 38 0 0 0 0
## 39 0 0 51 0 0 0 0
## 40 0 0 35 0 0 0 0
## 41 0 0 43 0 0 0 0
## 42 0 0 28 0 0 0 0
## 43 0 0 41 0 0 0 0
## 44 0 0 44 0 0 0 0
## 45 0 0 0 31 0 0 0
## 46 0 0 0 36 0 0 0
## 47 0 0 0 34 0 0 0
## 48 0 0 0 38 0 0 0
## 49 0 0 0 34 0 0 0
## 50 0 0 0 32 0 0 0
## 51 0 0 0 29 0 0 0
## 52 0 0 0 31 0 0 0
## 53 0 0 0 31 0 0 0
## 54 0 0 0 28 0 0 0
## 55 0 0 0 0 23 0 0
## 56 0 0 0 0 27 0 0
## 57 0 0 0 0 19 0 0
## 58 0 0 0 0 26 0 0
## 59 0 0 0 0 32 0 0
## 60 0 0 0 0 15 0 0
## 61 0 0 0 0 18 0 0
## 62 0 0 0 0 17 0 0
## 63 0 0 0 0 18 0 0
## 64 0 0 0 0 15 0 0
## 65 0 0 0 0 0 13 0
## 66 0 0 0 0 0 9 0
## 67 0 0 0 0 0 12 0
## 68 0 0 0 0 0 2 0
## 69 0 0 0 0 0 1 0
## 999999 0 0 0 0 0 0 1969
## [1] "Frequency table after encoding"
## IDR3_20. How old are you?
## 15-24 25-34 35-44 45-54 55-64 65 and older NA
## 539 515 403 324 210 37 1969
## [1] "Inspect value labels and relabel as necessary"
## 15-24 25-34 35-44 45-54 55-64 65 and older NA
## 1 2 3 4 5 6 7
# !!!Include relevant variables in list below
# Candidate indirect-PII variables flagged during dictionary inspection.
# The vector is passed as-is to the project helper capture_tables().
# NOTE(review): capture_tables() presumably tabulates each listed variable so
# low-frequency values can be spotted -- confirm against functions_1.7.R.
# "H2_12_TEXT" is still listed here; it is dropped from mydata right after
# this step.
indirect_PII <- c("HC2_O1", "HC2_O2", "HC2_O3", "HC2_O4", "HC2_O5", "HC2_O6","H2_12_TEXT","HC3","HC4_1",
"HC4_2","HC4_3","HC4_4","D_4","Inc_17","P1","P1A","P2","P3","P3A","P4","P4A","P8_O1",
"P8_O2","P8_O3","P8_3_number","P8_4_number","P8_5_number","P12A","P12A_TEXT","P13A_O1",
"P13A_O2","P13A_10_TEXT","P9B","P10B","P12B","P13B_O1","P13B_O2","P13B_10_TEXT","P9C_I1",
"P10C_I1","P11C_I1","P11_A3_I1","P12C_I1","P12C_TEXT_I1","P13C_O1_I1","P13C_10_TEXT_I1",
"P9C_I2","P10C_I2","P11C_I2","P11_A3_I2","P12C_I2","P9D_I1","P10D_I1","P11D_I1","P11_A4_I1",
"P12D_I1","P13D_O1_I1","P13D_10_TEXT_I1","P9D_I2","P10D_I2","P11D_I2","P11_A4_I2","P12D_I2",
"P13D_O1_I2","P13D_O2_I2","P9E_I1","P10E_I1","P11E_I1","P11_A5_I1","P12E_I1","P13E_O1_I1",
"P13E_O2_I1","P9E_I2","P10E_I2","P11E_I2","P11_A5_I2","P12E_I2","P13E_O1_I2","P14E_O1_I2",
"P9E_I3","P10E_I3","P11E_I3","P11_A5_I3","P12E_I3","P13E_O1_I3","P14E_O1_I3","P20A","P19B",
"P18C_I1","P19C_I1","P20C_I1","P18C_I2","P19C_I2","P20C_I2","P18D_I1","P19D_I1","P20D_I1",
"P18D_I2","P19D_I2","P20D_I2","P18E_I1","P19E_I1","P20E_I1","NEW_2_cl_I1","P19_cl_I1",
"D_9_cl_I1","D_4_cl_I1","NEW_2_cl_I2","P19_cl_I2","D_9_cl_I2","D_4_cl_I2","NEW_2_cl_I3",
"P19_cl_I3","D_9_cl_I3","D_4_cl_I3","NEW_2_cl_I4","P19_cl_I4","D_4_cl_I4","NEW_2_cl_I5",
"P19_cl_I5","D_9_cl_I5","D_4_cl_I5","NEW_2_cl_I6","P19_cl_I6","D_4_cl_I6","D_8_cl_I6","E2_2")
capture_tables (indirect_PII)
# Recode those with very specific values where more than half of the sample have actual data.
# Drop H2_12_TEXT: it actually holds verbatim data in Nepali.
mydata <- mydata[!(names(mydata) %in% "H2_12_TEXT")]
# Household composition variables with large and unusual numbers are top
# coded further below (see the HC3 step); the next statement handles E2_2.
# E2_2 has very low frequencies per language, so its values are encoded.
mydata <- encode_direct_PII_team(variables = "E2_2")
## [1] "Frequency table before encoding"
## E2_2. What language did you use other than Nepali?
## MAITHILI NEWAR 999999
## 1 6 3990
## [1] "Frequency table after encoding"
## E2_2. What language did you use other than Nepali?
## 1 2 3
## 1 6 3990
# Top-code HC3 at 5 so that cases with 5 or more adult household members share
# one category; 888 and 999999 are passed as missing codes.
mydata <- top_recode("HC3", break_point = 5, missing = c(888, 999999))
## [1] "Frequency table before encoding"
## HC3. How many people living in your household are at least 15 years old (have complet
## 0 1 2 3 4 5 6 7 8 9 13 888 999999
## 757 545 475 170 52 16 4 1 1 1 2 3 1970
## [1] "Frequency table after encoding"
## HC3. How many people living in your household are at least 15 years old (have complet
## 0 1 2 3 4 5 or more 888 999999
## 757 545 475 170 52 25 3 1970
# Top code high income at the 99.5th percentile.
# The 999999 missing code is excluded before the percentile is computed, and
# na.rm = TRUE keeps quantile() from stopping with an error if Inc_17 (or the
# comparison against 999999) yields NA values.
# NOTE(review): values 777/888/999 occur in Inc_17 and look like response
# codes rather than amounts; they are still included in the percentile
# calculation here -- confirm whether they should be excluded too.
percentile_99.5 <- floor(
  quantile(
    mydata$Inc_17[mydata$Inc_17 != 999999],
    probs = 0.995,
    na.rm = TRUE
  )
)
mydata <- top_recode(
  variable = "Inc_17",
  break_point = percentile_99.5,
  missing = 999999
)
## [1] "Frequency table before encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS).
## 0 5 50 60 400 500 600 700 777 888 999 1000 1200 1500 1600 2000 2400
## 133 1 1 1 1 5 2 1 4 1 22 3 1 4 5 27 2
## 2500 2600 3000 3500 4000 4500 4800 5000 6000 6500 7000 8000 8500 9000 10000 11000 11500
## 8 1 33 2 42 3 1 101 35 1 38 20 1 24 193 3 1
## 12000 12846 13000 14000 14500 15000 15500 16000 17000 18000 19000 19135 20000 21000 22000 22500 23000
## 54 1 11 10 1 175 1 24 8 16 6 1 222 2 12 1 5
## 24000 25000 26000 27000 27500 28000 30000 32000 33000 34000 35000 36000 37000 40000 41000 45000 48000
## 5 105 1 5 1 2 177 2 1 1 54 4 1 90 3 17 1
## 50000 54000 55000 57000 60000 62000 65000 66000 67000 68000 70000 75000 79000 79500 80000 85000 95000
## 115 2 4 2 48 1 7 1 1 1 15 4 1 1 13 1 1
## 1e+05 103000 104000 110000 115000 117000 125000 130000 135000 150000 160000 170000 2e+05 240000 250000 3e+05 320000
## 27 1 1 1 2 1 1 1 1 11 1 1 9 1 1 3 1
## 5e+05 6e+05 7e+05 999999 1e+06
## 1 1 1 1970 1
## [1] "Frequency table after encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS).
## 0 5 50 60 400 500 600 700
## 133 1 1 1 1 5 2 1
## 777 888 999 1000 1200 1500 1600 2000
## 4 1 22 3 1 4 5 27
## 2400 2500 2600 3000 3500 4000 4500 4800
## 2 8 1 33 2 42 3 1
## 5000 6000 6500 7000 8000 8500 9000 10000
## 101 35 1 38 20 1 24 193
## 11000 11500 12000 12846 13000 14000 14500 15000
## 3 1 54 1 11 10 1 175
## 15500 16000 17000 18000 19000 19135 20000 21000
## 1 24 8 16 6 1 222 2
## 22000 22500 23000 24000 25000 26000 27000 27500
## 12 1 5 5 105 1 5 1
## 28000 30000 32000 33000 34000 35000 36000 37000
## 2 177 2 1 1 54 4 1
## 40000 41000 45000 48000 50000 54000 55000 57000
## 90 3 17 1 115 2 4 2
## 60000 62000 65000 66000 67000 68000 70000 75000
## 48 1 7 1 1 1 15 4
## 79000 79500 80000 85000 95000 1e+05 103000 104000
## 1 1 13 1 1 27 1 1
## 110000 115000 117000 125000 130000 135000 150000 160000
## 1 2 1 1 1 1 11 1
## 170000 2e+05 or more 999999
## 1 19 1970
# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
# Categorical key variables used for the k-anonymity assessment.
selectedKeyVars <- c("D_4", "IDR3_20") ##!!! Replace with candidate categorical demo vars
# weight variable (not used for this dataset)
# selectedWeightVar = c('projwt') ##!!! Replace with weight var
# household id variable (cluster) (not used for this dataset)
# selectedHouseholdID = c('wpid') ##!!! Replace with household id
# Build the sdcMicro object from the key variables and display its summary.
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial
## The input dataset consists of 3997 rows and 1115 variables.
## --> Categorical key variables: D_4, IDR3_20
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## D_4 18 (18) 222.056 (222.056) 1 (1)
## IDR3_20 7 (7) 571.000 (571.000) 37 (37)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 10 (0.250%)
## - 3-anonymity: 22 (0.550%)
## - 5-anonymity: 51 (1.276%)
##
## ----------------------------------------------------------------------
# Collapse respondent education (D_4) into broader bands to reduce the risk
# of re-identification.
break_edu <- c(0, 6, 9, 11, 12, 13, 15, 16, 17)

# One label per band, coded 1-9; code 9 ("NA") holds the missing code.
labels_edu <- setNames(
  as.numeric(1:9),
  c(
    "Primary or less (0-5)",
    "Lower secondary (6-8)",
    "Secondary (9-10)",
    "SLC (11)",
    "CLASS 12/Intermediate level (12)",
    "Bachelor/Postgraduate level",
    "Literate, but never attended school",
    "Illiterate, and never attended school",
    "NA"
  )
)

mydata <- ordinal_recode(
  variable = "D_4",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## D_4. What is your highest completed education level? [You do not need to read the re
## Pre-school/Kindergarten CLASS 1 CLASS 2
## 1 31 54
## CLASS 3 CLASS 4 CLASS 5
## 71 75 151
## CLASS 6 CLASS 7 CLASS 8
## 69 85 120
## CLASS 9 CLASS 10 SLC
## 84 104 298
## CLASS 12/Intermediate level Bachelor level Post-Secondary Level (e.g., MA, PhD)
## 266 62 9
## Literate, but never attended school Illiterate, and never attended school
## 305 242
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,17) [17,1e+06)
## 0 1 0 0 0 0 0 0 0 0
## 1 31 0 0 0 0 0 0 0 0
## 2 54 0 0 0 0 0 0 0 0
## 3 71 0 0 0 0 0 0 0 0
## 4 75 0 0 0 0 0 0 0 0
## 5 151 0 0 0 0 0 0 0 0
## 6 0 69 0 0 0 0 0 0 0
## 7 0 85 0 0 0 0 0 0 0
## 8 0 120 0 0 0 0 0 0 0
## 9 0 0 84 0 0 0 0 0 0
## 10 0 0 104 0 0 0 0 0 0
## 11 0 0 0 298 0 0 0 0 0
## 12 0 0 0 0 266 0 0 0 0
## 13 0 0 0 0 0 62 0 0 0
## 14 0 0 0 0 0 9 0 0 0
## 15 0 0 0 0 0 0 305 0 0
## 16 0 0 0 0 0 0 0 242 0
## 999999 0 0 0 0 0 0 0 0 1970
## [1] "Frequency table after encoding"
## D_4. What is your highest completed education level? [You do not need to read the re
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 383 274 188
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 298 266 71
## Literate, but never attended school Illiterate, and never attended school NA
## 305 242 1970
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 1 2 3
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 4 5 6
## Literate, but never attended school Illiterate, and never attended school NA
## 7 8 9
# Break points and labels for the relatives' education questions (HC4_*),
# which also carry the special codes 777 (Refused to answer), 888 (Does not
# apply) and 999 (Don't know).
#
# Each special code gets its own interval and label. With the previous
# definition (breaks ending at 999, labels 9 = "Does not apply",
# 10 = "Don't Know", 11 = "NA") the recorded output shows the labels shifted
# by one: 777 was labelled "Does not apply", 888 was labelled "Don't Know",
# and 999 was merged with the 999999 missing code under "NA". The extra break
# at 1000 closes the 999 interval, and "Refused to answer" is added so that
# every interval has the matching label.
# NOTE: the recorded console output below was produced with the previous
# definition and will differ for codes 777/888/999 when the script is re-run.
break_edu <- c(0, 6, 9, 11, 12, 13, 15, 16, 777, 888, 999, 1000)
labels_edu <- c(
  "Primary or less (0-5)" = 1,
  "Lower secondary (6-8)" = 2,
  "Secondary (9-10)" = 3,
  "SLC (11)" = 4,
  "CLASS 12/Intermediate level (12)" = 5,
  "Bachelor/Postgraduate level" = 6,
  "Literate, but never attended school" = 7,
  "Illiterate, and never attended school" = 8,
  "Refused to answer" = 9,
  "Does not apply" = 10,
  "Don't Know" = 11,
  "NA" = 12
)

# Spouse's education.
mydata <- ordinal_recode(
  variable = "HC4_1",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## HC4_1. What is the highest completed education level of your spouse? [You do not need
## CLASS 1 CLASS 2 CLASS 3
## 15 48 48
## CLASS 4 CLASS 5 CLASS 6
## 68 115 57
## CLASS 7 CLASS 8 CLASS 9
## 77 114 69
## CLASS 10 SLC CLASS 12/Intermediate level
## 87 188 127
## Bachelor level Post-Secondary Level (e.g., MA, PhD) Literate, but never attended school
## 35 11 238
## Illiterate, and never attended school Does not apply Don't know
## 281 4 3
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888) [888,999) [999,1e+06)
## 1 15 0 0 0 0 0 0 0 0 0 0
## 2 48 0 0 0 0 0 0 0 0 0 0
## 3 48 0 0 0 0 0 0 0 0 0 0
## 4 68 0 0 0 0 0 0 0 0 0 0
## 5 115 0 0 0 0 0 0 0 0 0 0
## 6 0 57 0 0 0 0 0 0 0 0 0
## 7 0 77 0 0 0 0 0 0 0 0 0
## 8 0 114 0 0 0 0 0 0 0 0 0
## 9 0 0 69 0 0 0 0 0 0 0 0
## 10 0 0 87 0 0 0 0 0 0 0 0
## 11 0 0 0 188 0 0 0 0 0 0 0
## 12 0 0 0 0 127 0 0 0 0 0 0
## 13 0 0 0 0 0 35 0 0 0 0 0
## 14 0 0 0 0 0 11 0 0 0 0 0
## 15 0 0 0 0 0 0 238 0 0 0 0
## 16 0 0 0 0 0 0 0 281 0 0 0
## 888 0 0 0 0 0 0 0 0 0 4 0
## 999 0 0 0 0 0 0 0 0 0 0 3
## 999999 0 0 0 0 0 0 0 0 0 0 2412
## [1] "Frequency table after encoding"
## HC4_1. What is the highest completed education level of your spouse? [You do not need
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 294 248 156
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 188 127 46
## Literate, but never attended school Illiterate, and never attended school Don't Know
## 238 281 4
## NA
## 2415
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 1 2 3
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 4 5 6
## Literate, but never attended school Illiterate, and never attended school Does not apply
## 7 8 9
## Don't Know NA
## 10 11
# Father's education (HC4_2): recode with the shared break_edu / labels_edu
# definitions.
mydata <- ordinal_recode(
  variable = "HC4_2",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## HC4_2. What is the highest completed education level of your father? [You do not need
## Pre-school/Kindergarten CLASS 1 CLASS 2
## 1 12 27
## CLASS 3 CLASS 4 CLASS 5
## 33 23 61
## CLASS 6 CLASS 7 CLASS 8
## 15 21 44
## CLASS 9 CLASS 10 SLC
## 22 26 48
## CLASS 12/Intermediate level Bachelor level Post-Secondary Level (e.g., MA, PhD)
## 29 4 2
## Literate, but never attended school Illiterate, and never attended school Refused to answer
## 121 111 1
## Does not apply Don't know
## 51 11
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888) [888,999) [999,1e+06)
## 0 1 0 0 0 0 0 0 0 0 0 0
## 1 12 0 0 0 0 0 0 0 0 0 0
## 2 27 0 0 0 0 0 0 0 0 0 0
## 3 33 0 0 0 0 0 0 0 0 0 0
## 4 23 0 0 0 0 0 0 0 0 0 0
## 5 61 0 0 0 0 0 0 0 0 0 0
## 6 0 15 0 0 0 0 0 0 0 0 0
## 7 0 21 0 0 0 0 0 0 0 0 0
## 8 0 44 0 0 0 0 0 0 0 0 0
## 9 0 0 22 0 0 0 0 0 0 0 0
## 10 0 0 26 0 0 0 0 0 0 0 0
## 11 0 0 0 48 0 0 0 0 0 0 0
## 12 0 0 0 0 29 0 0 0 0 0 0
## 13 0 0 0 0 0 4 0 0 0 0 0
## 14 0 0 0 0 0 2 0 0 0 0 0
## 15 0 0 0 0 0 0 121 0 0 0 0
## 16 0 0 0 0 0 0 0 111 0 0 0
## 777 0 0 0 0 0 0 0 0 1 0 0
## 888 0 0 0 0 0 0 0 0 0 51 0
## 999 0 0 0 0 0 0 0 0 0 0 11
## 999999 0 0 0 0 0 0 0 0 0 0 3334
## [1] "Frequency table after encoding"
## HC4_2. What is the highest completed education level of your father? [You do not need
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 157 80 48
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 48 29 6
## Literate, but never attended school Illiterate, and never attended school Does not apply
## 121 111 1
## Don't Know NA
## 51 3345
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 1 2 3
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 4 5 6
## Literate, but never attended school Illiterate, and never attended school Does not apply
## 7 8 9
## Don't Know NA
## 10 11
# Mother's education (HC4_3): recode with the shared break_edu / labels_edu
# definitions.
mydata <- ordinal_recode(
  variable = "HC4_3",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## HC4_3. What is the highest completed education level of your mother? If you have more t
## CLASS 1 CLASS 2 CLASS 3
## 3 13 10
## CLASS 4 CLASS 5 CLASS 6
## 21 24 13
## CLASS 7 CLASS 8 CLASS 9
## 6 20 5
## CLASS 10 SLC CLASS 12/Intermediate level
## 9 23 5
## Bachelor level Literate, but never attended school Illiterate, and never attended school
## 1 227 260
## Does not apply Don't know
## 20 3
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888) [888,999) [999,1e+06)
## 1 3 0 0 0 0 0 0 0 0 0 0
## 2 13 0 0 0 0 0 0 0 0 0 0
## 3 10 0 0 0 0 0 0 0 0 0 0
## 4 21 0 0 0 0 0 0 0 0 0 0
## 5 24 0 0 0 0 0 0 0 0 0 0
## 6 0 13 0 0 0 0 0 0 0 0 0
## 7 0 6 0 0 0 0 0 0 0 0 0
## 8 0 20 0 0 0 0 0 0 0 0 0
## 9 0 0 5 0 0 0 0 0 0 0 0
## 10 0 0 9 0 0 0 0 0 0 0 0
## 11 0 0 0 23 0 0 0 0 0 0 0
## 12 0 0 0 0 5 0 0 0 0 0 0
## 13 0 0 0 0 0 1 0 0 0 0 0
## 15 0 0 0 0 0 0 227 0 0 0 0
## 16 0 0 0 0 0 0 0 260 0 0 0
## 888 0 0 0 0 0 0 0 0 0 20 0
## 999 0 0 0 0 0 0 0 0 0 0 3
## 999999 0 0 0 0 0 0 0 0 0 0 3334
## [1] "Frequency table after encoding"
## HC4_3. What is the highest completed education level of your mother? If you have more t
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 71 39 14
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 23 5 1
## Literate, but never attended school Illiterate, and never attended school Don't Know
## 227 260 20
## NA
## 3337
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 1 2 3
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 4 5 6
## Literate, but never attended school Illiterate, and never attended school Does not apply
## 7 8 9
## Don't Know NA
## 10 11
# Most-educated grandparent (HC4_4): recode with the shared break_edu /
# labels_edu definitions.
mydata <- ordinal_recode(
  variable = "HC4_4",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## HC4_4. Think about your grandparents, and the grandparent with the most education. What
## CLASS 2 CLASS 3 CLASS 4
## 2 2 1
## CLASS 5 CLASS 9 SLC
## 2 1 1
## CLASS 12/Intermediate level Literate, but never attended school Illiterate, and never attended school
## 2 29 56
## Don't know
## 3
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888) [888,999) [999,1e+06)
## 2 2 0 0 0 0 0 0 0 0 0 0
## 3 2 0 0 0 0 0 0 0 0 0 0
## 4 1 0 0 0 0 0 0 0 0 0 0
## 5 2 0 0 0 0 0 0 0 0 0 0
## 9 0 0 1 0 0 0 0 0 0 0 0
## 11 0 0 0 1 0 0 0 0 0 0 0
## 12 0 0 0 0 2 0 0 0 0 0 0
## 15 0 0 0 0 0 0 29 0 0 0 0
## 16 0 0 0 0 0 0 0 56 0 0 0
## 999 0 0 0 0 0 0 0 0 0 0 3
## 999999 0 0 0 0 0 0 0 0 0 0 3898
## [1] "Frequency table after encoding"
## HC4_4. Think about your grandparents, and the grandparent with the most education. What
## Primary or less (0-5) Secondary (9-10) SLC (11)
## 7 1 1
## CLASS 12/Intermediate level (12) Literate, but never attended school Illiterate, and never attended school
## 2 29 56
## NA
## 3901
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 1 2 3
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 4 5 6
## Literate, but never attended school Illiterate, and never attended school Does not apply
## 7 8 9
## Don't Know NA
## 10 11
# Re-run to check 2-anonymity after the education recodes.
selectedKeyVars <- c("D_4", "IDR3_20")
# Rebuild the sdcMicro object on the recoded data and display its summary.
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial
## The input dataset consists of 3997 rows and 1115 variables.
## --> Categorical key variables: D_4, IDR3_20
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## D_4 9 (9) 444.111 (444.111) 71 (71)
## IDR3_20 7 (7) 571.000 (571.000) 37 (37)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 2 (0.050%)
## - 3-anonymity: 6 (0.150%)
## - 5-anonymity: 19 (0.475%)
##
## ----------------------------------------------------------------------
# Show values of key variable of records that violate k-anonymity
#mydata <- labelDataset(mydata)
# Logical flag per record: TRUE where column 2 of the individual-risk matrix
# is below 2, i.e. the record violates 2-anonymity.
# NOTE(review): column 2 is presumably the sample frequency count of the
# record's key combination -- confirm against the sdcMicro documentation.
notAnon <- sdcInitial@risk$individual[, 2] < 2 # for 2-anonymity
# Display the key-variable values of the flagged records.
mydata[notAnon, selectedKeyVars]
## # A tibble: 2 x 2
## D_4 IDR3_20
## <dbl+lbl> <dbl+lbl>
## 1 6 [Bachelor/Postgraduate level] 5 [55-64]
## 2 9 [NA] 4 [45-54]
# Apply sdcMicro's local suppression to the key variables.
sdcFinal <- localSuppression(sdcInitial)
# Recombining anonymized variables: display the manipulated key variables for
# the records flagged above.
extractManipData(sdcFinal)[notAnon, selectedKeyVars] # manipulated variables HH
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
## D_4 IDR3_20
## 826 NA 5
## 1075 NA 4
# Carry the suppression over to the working data: set D_4 to 9, the code
# labelled "NA" in the D_4 value labels, for the flagged records.
mydata[notAnon, "D_4"] <- 9
# !!! Identify open-end variables here:
# Open-ended (free-text) variables to be reviewed for sensitive content.
# The vector is passed as-is to the project helper report_open().
# NOTE(review): report_open() presumably writes the responses to
# "verbatims.csv", which the next step asks the analyst to review -- confirm
# against functions_1.7.R.
open_ends <- c("SrvyrComment",
"H2_12_TEXT_Translation",
"HTNx3_2_14_TEXT_Translation",
"HTN_5x3_TEXT_Translation",
"HTV_1_10_TEXTx3_Translation",
"HTV_3_11_TEXTx3_Translation",
"CPR5i_TEXT_Translation",
"G1_00_08_TEXT_Translation",
"P13A_10_TEXT_Translation",
"P14A_12_TEXT_Translation",
"SIMPOC7A_10_TEXT_Translation",
"P13B_10_TEXT_Translation",
"P14B_12_TEXT_Translation",
"SIMPOC7B_10_TEXT_Translation",
"P13C_10_TEXT_I1_Translation",
"P14C_12_TEXT_I1_Translation",
"SIMPOC7C_10_TEXT_I1_Translation",
"P14C_12_TEXT_I2_Translation",
"P13D_10_TEXT_I1_Translation",
"P14D_12_TEXT_I1_Translation",
"P14D_12_TEXT_I2_Translation",
"P13E_10_TEXT_I1_Translation",
"P14E_12_TEXT_I1_Translation",
"SIMPOC7E_10_TEXT_I1_Translation",
"P14E_12_TEXT_I2_Translation",
"P14E_12_TEXT_I3_Translation",
"NEW_3_12_TEXT_Translation",
"NEW_9_TEXT_Translation",
"SIMPOC7_cl_10_TEXT_I1_Translate",
"SIMPOC7_cl_10_TEXT_I2_Translate",
"NEW_10_TEXT_Translation",
"P13_cl_O3_TEXT_I1_Translation",
"NEW_9_cl_TEXT_I1_Translation",
"NEW_9_cl_TEXT_I2_Translation",
"NEW_9_cl_TEXT_I3_Translation",
"P14_cl_O2_I1_TEXT_Translation",
"P13_cl_O2_TEXT_I2_Translation",
"SIMPOC7_cl_10_TEXT_I3_Translate",
"P14_cl_O1_I3_TEXT_Translation",
"P14_cl_O1_I2_TEXT_Translation",
"IDR3_13_TEXT_Translation",
"IDR3_15_TEXT_Translation",
"e3e_TEXT_Translation",
"E2_11_8_TEXT_Translation",
"E_14_7_TEXT_Translation")
report_open (list_open_ends = open_ends)
# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number
# Each assignment below overwrites one open-end response, addressed by its row
# number, with a manually redacted version of the text. The row numbers are
# hard-coded, so they are only valid for this exact dataset and row order.
# ("bother" corrected to "brother" in the first replacement text, since this
# string is written into the released data.)
mydata$E_14_7_TEXT_Translation[2380] <- "Respondent's brother was tricked in bad activities and later threatened to help [activity redacted]"
mydata$E_14_7_TEXT_Translation[3099] <- "In Q64, respondent said there was no income and later in Q307 respondent said [amount redacted] so entered the option more than 12,000 in Q307"
mydata$E_14_7_TEXT_Translation[3680] <- "GPS did not capture for about 20 minutes and started the interview without GPS. In Q64 respondent did not have any income but her/his son sent [amount redacted] the other day"
mydata$IDR3_13_TEXT_Translation[87] <- "[respondent name redacted] is dead"
mydata$NEW_10_TEXT_Translation[2792] <- "Shop [type redacted]"
# Drop the surveyor comment column entirely rather than redacting row by row.
mydata <- mydata[!(names(mydata) %in% "SrvyrComment")]
# Set up the map objects: a country outline taken from the "world" map data and
# an administrative boundary object read from a local .rds file.
countrymap <- filter(map_data("world"), region == "Nepal") #!!! Select correct country
# Alternative: download the boundary instead of reading the local file.
#admin <- raster::getData("GADM", country="NP", level=0) #!!! Select correct country map using standard 2-letter country codes: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
admin <- readRDS("gadm36_NPL_0_sp.rds")
# Displace every pair of GPS variables (longitude, latitude), one pair per call.
# Check the summary statistics and maps printed before and after displacement.
# NOTE(review): displace() is defined outside this chunk; the meaning of
# samp_num and other_num is not visible here -- confirm before changing them.
gps.vars <- c("Longitude", "Latitude") # !!!Include relevant variables, always longitude first, latitude second.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
) # May take a few minutes to process.
## Warning: Removed 52 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## Longitude Latitude
## Min. :84.31 Min. :26.85
## 1st Qu.:85.07 1st Qu.:27.56
## Median :85.47 Median :27.65
## Mean :85.35 Mean :27.59
## 3rd Qu.:85.61 3rd Qu.:27.73
## Max. :86.15 Max. :28.00
## NA's :52 NA's :52
## Warning: Removed 52 rows containing missing values (geom_point).
## Warning: Removed 52 rows containing missing values (geom_point).
## Warning: Removed 52 rows containing missing values (geom_point).
## Warning: Removed 52 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## Longitude Latitude
## Min. :84.28 Min. :26.83
## 1st Qu.:85.06 1st Qu.:27.55
## Median :85.46 Median :27.64
## Mean :85.35 Mean :27.59
## 3rd Qu.:85.61 3rd Qu.:27.73
## Max. :86.18 Max. :28.03
## NA's :52 NA's :52
## [1] "Processing time = 7.08120536406835"
# Displace the GPSinitial coordinate pair with the same settings as the other pairs.
gps.vars <- c("GPSinitial_LO", "GPSinitial_LA") # !!!Include relevant variables, always longitude first, latitude second.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
) # May take a few minutes to process.
## Warning: Removed 167 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## GPSinitial_LO GPSinitial_LA
## Min. :84.31 Min. :26.85
## 1st Qu.:85.07 1st Qu.:27.56
## Median :85.47 Median :27.65
## Mean :85.35 Mean :27.59
## 3rd Qu.:85.61 3rd Qu.:27.73
## Max. :86.15 Max. :28.00
## NA's :167 NA's :167
## Warning: Removed 167 rows containing missing values (geom_point).
## Warning: Removed 167 rows containing missing values (geom_point).
## Warning: Removed 167 rows containing missing values (geom_point).
## Warning: Removed 167 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## GPSinitial_LO GPSinitial_LA
## Min. :84.28 Min. :26.82
## 1st Qu.:85.07 1st Qu.:27.55
## Median :85.46 Median :27.64
## Mean :85.35 Mean :27.59
## 3rd Qu.:85.61 3rd Qu.:27.73
## Max. :86.17 Max. :28.04
## NA's :167 NA's :167
## [1] "Processing time = 8.29655353625615"
# Displace the gps_CEa coordinate pair with the same settings as the other pairs.
gps.vars <- c("gps_CEa_LO", "gps_CEa_LA") # !!!Include relevant variables, always longitude first, latitude second.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
) # May take a few minutes to process.
## Warning: Removed 2258 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## gps_CEa_LO gps_CEa_LA
## Min. :84.31 Min. :26.85
## 1st Qu.:84.96 1st Qu.:27.56
## Median :85.45 Median :27.64
## Mean :85.33 Mean :27.59
## 3rd Qu.:85.60 3rd Qu.:27.72
## Max. :86.15 Max. :28.00
## NA's :2258 NA's :2258
## Warning: Removed 2258 rows containing missing values (geom_point).
## Warning: Removed 2258 rows containing missing values (geom_point).
## Warning: Removed 2258 rows containing missing values (geom_point).
## Warning: Removed 2258 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## gps_CEa_LO gps_CEa_LA
## Min. :84.28 Min. :26.83
## 1st Qu.:84.98 1st Qu.:27.55
## Median :85.45 Median :27.64
## Mean :85.33 Mean :27.59
## 3rd Qu.:85.60 3rd Qu.:27.72
## Max. :86.17 Max. :28.04
## NA's :2258 NA's :2258
## [1] "Processing time = 4.31941741704941"
# Displace the gpsenumimp coordinate pair with the same settings as the other pairs.
gps.vars <- c("gpsenumimp_LO", "gpsenumimp_LA") # !!!Include relevant variables, always longitude first, latitude second.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
) # May take a few minutes to process.
## Warning: Removed 2235 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## gpsenumimp_LO gpsenumimp_LA
## Min. :84.31 Min. :26.85
## 1st Qu.:84.96 1st Qu.:27.56
## Median :85.45 Median :27.64
## Mean :85.32 Mean :27.59
## 3rd Qu.:85.60 3rd Qu.:27.72
## Max. :86.15 Max. :28.00
## NA's :2235 NA's :2235
## Warning: Removed 2235 rows containing missing values (geom_point).
## Warning: Removed 2235 rows containing missing values (geom_point).
## Warning: Removed 2235 rows containing missing values (geom_point).
## Warning: Removed 2235 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## gpsenumimp_LO gpsenumimp_LA
## Min. :84.29 Min. :26.83
## 1st Qu.:84.97 1st Qu.:27.54
## Median :85.44 Median :27.64
## Mean :85.32 Mean :27.59
## 3rd Qu.:85.60 3rd Qu.:27.72
## Max. :86.17 Max. :28.04
## NA's :2235 NA's :2235
## [1] "Processing time = 4.47818704843521"
Save the processed dataset in Stata (.dta) and SPSS (.sav) formats, adding "_PU" (Public Use) to the end of the file name.
# Write the processed data once as a Stata file and once as an SPSS file; each
# output name is `filename` followed by "_PU" and the format's extension.
haven::write_dta(data = mydata, path = paste0(filename, "_PU.dta"))
haven::write_sav(data = mydata, path = paste0(filename, "_PU.sav"))