# Start from an empty workspace, including hidden (dot-prefixed) objects.
# The previous call passed `all=t`, i.e. the transpose function `t`, where a
# logical value was intended.
rm(list = ls(all.names = TRUE))
filename <- "Nepal Round 3_FinalRaw" # !!!Update filename
# The helper functions called below are not defined in this file; they
# presumably come from this sourced script (verify).
source("functions_1.7.R")
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Location: Small Location (<100,000) Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
#!!!Save flagged dictionary in .xlsx format and continue processing data with subset of flagged variables
# !!!Include any Direct PII variables
# Columns listed in `dropvars` are removed from `mydata`.
dropvars <- c(
  "SbjNam", "FrScName", "RvwName",
  "IDR3_3", "IDR3_18", "IDR3_19", "IDR3_23"
)
# Keep every column whose name is not in the drop list.
mydata <- mydata[!(names(mydata) %in% dropvars)]
# !!!Replace vector in "variables" field below with relevant variable names
# Encode Direct PII-team: the `Srvyr` column is passed to the team-PII encoder.
mydata <- encode_direct_PII_team(variables = "Srvyr")
## [1] "Frequency table before encoding"
## Srvyr. Srvyr
## alka.adhikari ambir.raj.kulung amrita.roka anjana.kumari.dulal ashish.shrestha bhanu.bhakta.dhakal
## 79 96 94 98 132 79
## dev.raj.nepal dhan.kumari.darlami dilip.joshi gita.maharjan gyanendra.parajuli kajiman.mahatara
## 240 86 217 103 217 236
## kamala.sharma mani.ram.dahal manjula.giri min.kumari.shrestha nabina.khadka niraj.shrestha
## 80 242 99 86 80 89
## prahlad.mainali pramila.shrestha pratika.shrestha rabischandra.bhatta ram.kumar.acharya sajina.shrestha
## 239 77 88 92 91 78
## sandip.shrestha sanjay.pokharel sapana.gautam sarita.shrestha sunil.shrestha tirtha.maya.rai
## 273 69 84 101 272 106
## upeksha yamuna.karki
## 1 87
## [1] "Frequency table after encoding"
## Srvyr. Srvyr
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
## 79 96 94 98 132 79 240 86 217 103 217 236 80 242 99 86 80 89 239 77 88 92 91 78 273 69 84 101 272 106 1 87
# !!!Include relevant variables, but check their population size first to confirm they are <100,000
# Location variables to encode: the IDR3_6_* columns plus IDR3_7.
locvars <- c(
  paste0("IDR3_6_", c(19, 22, 23, 24, 26, 30, 31, 35)),
  "IDR3_7"
)
# Encode the location variables; 999999 is passed as the missing-value code.
mydata <- encode_location(
  variables = locvars,
  missing = 999999
)
## [1] "Frequency table before encoding"
## IDR3_6_19. VDC or Municaplity of District Sarlahi
## Barahathawa Dhungrekhola Dhurkauli Lalbandi Malangawa NP Netraganj Raniganj Sankarpur 999999
## 65 66 64 62 64 64 64 62 3500
## [1] "Frequency table after encoding"
## IDR3_6_19. VDC or Municaplity of District Sarlahi
## 280 281 282 283 284 285 286 287 999999
## 62 65 66 64 64 64 62 64 3500
## [1] "Frequency table before encoding"
## IDR3_6_22. VDC or Municaplity of District Dolakha
## Bhimeswor NP Bocha Dandakharka Fasku Katakuti Lamidanda Melung Pawati 999999
## 61 39 62 62 63 62 64 64 3534
## [1] "Frequency table after encoding"
## IDR3_6_22. VDC or Municaplity of District Dolakha
## 904 905 907 908 909 910 911 912 999999
## 64 62 63 61 64 62 62 39 3534
## [1] "Frequency table before encoding"
## IDR3_6_23. VDC or Municaplity of District Sindhupalchok
## Badegau Irkhu BhoteNamlang Talamarang Ichok Kadambas Langarche Melamchi 999999
## 64 66 66 65 63 60 62 64 3501
## [1] "Frequency table after encoding"
## IDR3_6_23. VDC or Municaplity of District Sindhupalchok
## 876 877 879 880 881 882 883 884 999999
## 66 65 62 60 64 64 63 66 3501
## [1] "Frequency table before encoding"
## IDR3_6_24. VDC or Municaplity of District Kavrepalanchok
## Anaikot Baluwapati Deupur Chalal Ganeshthan Kalati Bhumidanda Mahankal Chaur Methinkot Patalekhet
## 63 62 67 62 61 62 64
## Raviopi 999999
## 62 3508
## [1] "Frequency table after encoding"
## IDR3_6_24. VDC or Municaplity of District Kavrepalanchok
## 513 514 515 516 518 519 520 521 999999
## 62 62 67 61 62 63 62 64 3508
## [1] "Frequency table before encoding"
## IDR3_6_26. VDC or Municaplity of District Bhaktapur
## Balkot Changunarayan Chitapol Duwakot Gundu Madhyapur Thimi NP Nankhel
## 59 63 64 63 62 70 62
## Sirutar 999999
## 58 3510
## [1] "Frequency table after encoding"
## IDR3_6_26. VDC or Municaplity of District Bhaktapur
## 689 690 691 692 693 694 695 697 999999
## 62 62 63 63 59 58 70 64 3510
## [1] "Frequency table before encoding"
## IDR3_6_30. VDC or Municaplity of District Dhading
## Baireni Dhussa Khari Kiranchok Naubise Salyantar Sunaula Bazar Thakre 999999
## 62 64 62 63 64 63 62 64 3507
## [1] "Frequency table after encoding"
## IDR3_6_30. VDC or Municaplity of District Dhading
## 406 407 408 409 410 411 412 413 999999
## 64 63 62 64 64 62 63 62 3507
## [1] "Frequency table before encoding"
## IDR3_6_31. VDC or Municaplity of District Makwanpur
## Chitlang Churiyamai Fakhel Padampokhari Kulekhani Nibuwatar
## 61 63 63 65 62 61
## Shreepur Chhatiwan Sisneri Mahadevsthan 999999
## 62 63 3511
## [1] "Frequency table after encoding"
## IDR3_6_31. VDC or Municaplity of District Makwanpur
## 635 636 637 638 639 640 641 642 999999
## 61 61 63 62 63 62 65 63 3511
## [1] "Frequency table before encoding"
## IDR3_6_35. VDC or Municaplity of District Chitwan
## Birendranagar Jutpani Kathar Khairahani Padampur Parbatipur Piple Shaktikhor 999999
## 63 64 63 62 61 62 65 66 3505
## [1] "Frequency table after encoding"
## IDR3_6_35. VDC or Municaplity of District Chitwan
## 797 798 799 800 801 802 804 805 999999
## 63 62 63 66 64 62 61 65 3505
## [1] "Frequency table before encoding"
## IDR3_7. Ward Number
## 1 2 3 4 5 6 7 8 9 10 11 12 14
## 504 542 349 494 322 545 393 300 429 25 32 38 38
## [1] "Frequency table after encoding"
## IDR3_7. Ward Number
## 505 506 507 508 509 510 511 512 513 514 515 516 517
## 393 25 38 494 349 504 322 38 545 300 542 429 32
# Focus on variables with a "Lowest Freq" of 10 or less.
# Break points for the age bands: every 10 years from 15 to 65, then 100.
break_age <- c(seq(15, 65, by = 10), 100)
# Numeric codes 1-7 named with the label of each recoded band.
labels_age <- setNames(
  c(1, 2, 3, 4, 5, 6, 7),
  c("15-24", "25-34", "35-44", "45-54", "55-64", "65 and older", "NA")
)
# Recode age (IDR3_20) into the bands defined by `break_age` / `labels_age`.
mydata <- ordinal_recode(
  variable = "IDR3_20",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)
## [1] "Frequency table before encoding"
## IDR3_20. How old are you?
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
## 17 43 62 62 64 72 68 73 78 61 50 42 35 53 60 50 55 56 53 52 36 35 38 51 35 43 28 41 44 31 36 34 37 35 32 29 31 31 28 24 26 18 27 33 15 18
## 62 63 64 65 66 67 68 69
## 17 18 14 13 9 12 2 1
## recoded
## [15,25) [25,35) [35,45) [45,55) [55,65) [65,100) [100,1e+06)
## 16 17 0 0 0 0 0 0
## 17 43 0 0 0 0 0 0
## 18 62 0 0 0 0 0 0
## 19 62 0 0 0 0 0 0
## 20 64 0 0 0 0 0 0
## 21 72 0 0 0 0 0 0
## 22 68 0 0 0 0 0 0
## 23 73 0 0 0 0 0 0
## 24 78 0 0 0 0 0 0
## 25 0 61 0 0 0 0 0
## 26 0 50 0 0 0 0 0
## 27 0 42 0 0 0 0 0
## 28 0 35 0 0 0 0 0
## 29 0 53 0 0 0 0 0
## 30 0 60 0 0 0 0 0
## 31 0 50 0 0 0 0 0
## 32 0 55 0 0 0 0 0
## 33 0 56 0 0 0 0 0
## 34 0 53 0 0 0 0 0
## 35 0 0 52 0 0 0 0
## 36 0 0 36 0 0 0 0
## 37 0 0 35 0 0 0 0
## 38 0 0 38 0 0 0 0
## 39 0 0 51 0 0 0 0
## 40 0 0 35 0 0 0 0
## 41 0 0 43 0 0 0 0
## 42 0 0 28 0 0 0 0
## 43 0 0 41 0 0 0 0
## 44 0 0 44 0 0 0 0
## 45 0 0 0 31 0 0 0
## 46 0 0 0 36 0 0 0
## 47 0 0 0 34 0 0 0
## 48 0 0 0 37 0 0 0
## 49 0 0 0 35 0 0 0
## 50 0 0 0 32 0 0 0
## 51 0 0 0 29 0 0 0
## 52 0 0 0 31 0 0 0
## 53 0 0 0 31 0 0 0
## 54 0 0 0 28 0 0 0
## 55 0 0 0 0 24 0 0
## 56 0 0 0 0 26 0 0
## 57 0 0 0 0 18 0 0
## 58 0 0 0 0 27 0 0
## 59 0 0 0 0 33 0 0
## 60 0 0 0 0 15 0 0
## 61 0 0 0 0 18 0 0
## 62 0 0 0 0 17 0 0
## 63 0 0 0 0 18 0 0
## 64 0 0 0 0 14 0 0
## 65 0 0 0 0 0 13 0
## 66 0 0 0 0 0 9 0
## 67 0 0 0 0 0 12 0
## 68 0 0 0 0 0 2 0
## 69 0 0 0 0 0 1 0
## 999999 0 0 0 0 0 0 1983
## [1] "Frequency table after encoding"
## IDR3_20. How old are you?
## 15-24 25-34 35-44 45-54 55-64 65 and older NA
## 539 515 403 324 210 37 1983
## [1] "Inspect value labels and relabel as necessary"
## 15-24 25-34 35-44 45-54 55-64 65 and older NA
## 1 2 3 4 5 6 7
# !!!Include relevant variables in list below
# Variables flagged as indirect PII, in their original order; rows group names
# that share a prefix purely for readability.
indirect_PII <- c(
  "IDR3_20", "D_9",
  "HC2_O1", "HC2_O2", "HC2_O3", "HC2_O4", "HC2_O5", "HC2_O6",
  "HC3", "HC4_1", "HC4_2", "HC4_3", "HC4_4",
  "D_4", "Inc_17", "MC_39x3_1b", "MC_39x3_1d", "Stigma1_2",
  "HT_13x3_1x3", "HT_13x3_4x3", "HT_13x3_7x3", "HT_13x3_13x3",
  "LE20_1_r3", "I_3_conjoint2_3_r3", "G1_04",
  "P3", "P3A", "P4", "P4A", "P8_O1", "P8_O2", "P8_O3",
  "P12A", "P13A_O1", "P13A_O2",
  "P9B", "P10B", "P12B", "P13B_O1", "P13B_O2",
  "I_1_P9C", "I_1_P10C", "I_1_P11C", "I_1_P11_A3", "I_1_P12C", "I_1_P13C_O1",
  "I_2_P9C", "I_2_P10C", "I_2_P11_A3", "I_2_P12C",
  "I_3_P9C", "I_3_P10C", "I_3_P11_A3", "I_3_P12C", "I_3_P13C_O1",
  "I_4_P9C", "I_4_P10C", "I_4_P11_A3", "I_4_P12C", "I_4_P13C_O1",
  "I_5_P9C", "I_5_P10C", "I_5_P11_A3", "I_5_P12C", "I_5_P13C_O1",
  "I_6_P9C", "I_6_P10C", "I_6_P11C", "I_6_P11_A3", "I_6_P12C", "I_6_P13C_O1",
  "I_7_P9C", "I_7_P10C", "I_7_P11_A3", "I_7_P12C", "I_7_P13C_O1",
  "I_8_P9C", "I_8_P10C", "I_8_P11C", "I_8_P11_A3", "I_8_P12C", "I_8_P13C_O1",
  "I_1_P9D", "I_1_P10D", "I_1_P11D", "I_1_P11_A4", "I_1_P12D", "I_1_P13D_O1",
  "I_2_P9D", "I_2_P10D", "I_2_P11D", "I_2_P11_A4", "I_2_P12D", "I_2_P13D_O1",
  "I_1_P9E", "I_1_P10E", "I_1_P11E", "I_1_P11_A5", "I_1_P12E", "I_1_P13E_O1",
  "I_2_P9E", "I_2_P10E", "I_2_P11E", "I_2_P11_A5", "I_2_P12E", "I_2_P13E_O1",
  "I_3_P9E", "I_3_P10E", "I_3_P11E", "I_3_P11_A5", "I_3_P12E", "I_3_P13E_O1",
  "P20A", "P18B", "P19B",
  "I_1_P18C", "I_1_P19C", "I_1_P20C", "I_2_P18C",
  "I_1_P18D",
  "I_2_P18D", "I_2_P19D", "I_2_P20D",
  "I_3_P18D", "I_3_P19D", "I_3_P20D",
  "I_4_P18D", "I_4_P19D", "I_4_P20D",
  "I_5_P18D", "I_5_P19D", "I_5_P20D",
  "I_6_P18D", "I_6_P19D", "I_6_P20D",
  "I_7_P18D", "I_7_P19D", "I_7_P20D",
  "I_8_P18D", "I_8_P19D", "I_8_P20D",
  "I_1_P18E", "I_1_P19E", "I_1_P20E",
  "I_2_P18E", "I_2_P19E", "I_2_P20E",
  "I_3_P18E", "I_3_P19E", "I_3_P20E",
  "I_4_P18E", "I_4_P19E", "I_4_P20E",
  "I_5_P18E", "I_5_P19E", "I_5_P20E",
  "I_6_P18E", "I_6_P19E", "I_6_P20E",
  "I_7_P18E", "I_7_P19E", "I_7_P20E",
  "I_8_P18E", "I_8_P19E", "I_8_P20E",
  "Child1",
  "I_1_NEW_2_cl", "I_1_P19_cl", "I_1_D_9_cl", "I_1_D_4_cl",
  "I_2_NEW_2_cl", "I_2_P19_cl", "I_2_D_9_cl", "I_2_D_4_cl",
  "I_3_NEW_2_cl", "I_3_P19_cl", "I_3_D_9_cl", "I_3_D_4_cl",
  "I_4_NEW_2_cl", "I_4_P19_cl", "I_4_D_4_cl",
  "I_5_NEW_2_cl", "I_5_P19_cl", "I_5_D_9_cl", "I_5_D_4_cl",
  "I_6_NEW_2_cl", "I_6_P19_cl", "I_6_D_4_cl",
  "update6_1", "update6_2", "E2_2"
)
# Run capture_tables() on the flagged indirect-PII variables.
capture_tables(indirect_PII)
# Recode those with very specific values where more than half of the sample have actual data.
# E2_2: encode as low frequencies on languages.
mydata <- encode_direct_PII_team(variables = "E2_2")
## [1] "Frequency table before encoding"
## E2_2. What language did you use other than Nepali?
## MAITHILI NEWAR 999999
## 1 6 4004
## [1] "Frequency table after encoding"
## E2_2. What language did you use other than Nepali?
## 1 2 3
## 1 6 4004
# Topcode cases with 5 or more adult household members; 888 and 999999 are
# passed as missing codes.
mydata <- top_recode(
  variable = "HC3",
  break_point = 5,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## HC3. How many people living in your household are at least 15 years old (have complet
## 0 1 2 3 4 5 6 7 8 9 13 888 999999
## 758 544 474 170 52 16 4 1 1 1 3 3 1984
## [1] "Frequency table after encoding"
## HC3. How many people living in your household are at least 15 years old (have complet
## 0 1 2 3 4 5 or more 888 999999
## 758 544 474 170 52 26 3 1984
# Top code high income to the 99.5 percentile
# The cut-off is computed on values other than the 999999 missing code;
# na.rm guards against NA entries in the column.
percentile_99.5 <- floor(
  quantile(
    mydata$Inc_17[mydata$Inc_17 != 999999],
    probs = 0.995,
    na.rm = TRUE
  )
)
# Write the result back to `mydata`. It was previously stored in `mydata2`,
# which nothing else in this script reads, so the exported data kept the
# un-top-coded incomes.
mydata <- top_recode(
  variable = "Inc_17",
  break_point = percentile_99.5,
  missing = 999999
)
## [1] "Frequency table before encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS).
## 0 5 50 60 400 500 600 700 777 888 999 1000 1200 1500 1600 2000 2400 2500 2600
## 133 1 1 1 1 5 2 1 4 1 22 3 1 4 5 27 2 8 1
## 3000 3500 4000 4500 4800 5000 6000 6500 7000 8000 8500 9000 10000 11000 11500 12000 12846 13000 14000
## 33 2 42 3 1 101 35 1 38 20 1 24 193 3 1 54 1 11 10
## 14500 15000 15500 16000 17000 18000 19000 19135 20000 21000 22000 22500 23000 24000 25000 26000 27000 27500 28000
## 1 174 1 24 8 16 6 1 222 2 12 1 5 5 105 1 5 1 2
## 30000 32000 33000 34000 35000 36000 37000 40000 41000 45000 48000 50000 54000 55000 57000 60000 62000 65000 66000
## 177 2 1 1 54 4 1 90 3 17 1 115 2 4 2 48 1 7 1
## 67000 68000 70000 75000 79000 79500 80000 85000 95000 1e+05 103000 104000 110000 115000 117000 125000 130000 135000 150000
## 1 1 15 4 1 1 13 1 1 27 1 1 1 2 1 1 1 1 12
## 160000 170000 2e+05 240000 250000 3e+05 320000 5e+05 6e+05 7e+05 999999 1e+06
## 1 1 9 1 1 3 1 1 1 1 1984 1
## [1] "Frequency table after encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS).
## 0 5 50 60 400 500 600 700 777
## 133 1 1 1 1 5 2 1 4
## 888 999 1000 1200 1500 1600 2000 2400 2500
## 1 22 3 1 4 5 27 2 8
## 2600 3000 3500 4000 4500 4800 5000 6000 6500
## 1 33 2 42 3 1 101 35 1
## 7000 8000 8500 9000 10000 11000 11500 12000 12846
## 38 20 1 24 193 3 1 54 1
## 13000 14000 14500 15000 15500 16000 17000 18000 19000
## 11 10 1 174 1 24 8 16 6
## 19135 20000 21000 22000 22500 23000 24000 25000 26000
## 1 222 2 12 1 5 5 105 1
## 27000 27500 28000 30000 32000 33000 34000 35000 36000
## 5 1 2 177 2 1 1 54 4
## 37000 40000 41000 45000 48000 50000 54000 55000 57000
## 1 90 3 17 1 115 2 4 2
## 60000 62000 65000 66000 67000 68000 70000 75000 79000
## 48 1 7 1 1 1 15 4 1
## 79500 80000 85000 95000 1e+05 103000 104000 110000 115000
## 1 13 1 1 27 1 1 1 2
## 117000 125000 130000 135000 150000 160000 170000 2e+05 or more 999999
## 1 1 1 1 12 1 1 19 1984
# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
# Categorical key variables for the sdcMicro object.
selectedKeyVars <- c("D_4", "IDR3_20") ##!!! Replace with candidate categorical demo vars
# Optional settings, currently disabled:
#   weight variable:                  selectedWeightVar = c('projwt')  ##!!! Replace with weight var
#   household id variable (cluster):  selectedHouseholdID = c('wpid')  ##!!! Replace with household id
# Create the sdcMicro object from the key variables, then print it.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)
sdcInitial
## The input dataset consists of 4011 rows and 1162 variables.
## --> Categorical key variables: D_4, IDR3_20
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## D_4 18 (18) 222.833 (222.833) 1 (1)
## IDR3_20 7 (7) 573.000 (573.000) 37 (37)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 10 (0.249%)
## - 3-anonymity: 22 (0.548%)
## - 5-anonymity: 51 (1.272%)
##
## ----------------------------------------------------------------------
# Recode of education and age to reduce risk of re-identification
# In the recorded output each pair of consecutive break points becomes one
# category [lower, upper), and labels are matched to categories by position.
break_edu <- c(0, 6, 9, 11, 12, 13, 15, 16, 777, 888, 999)
# Codes 9-11 cover the special raw codes. The recorded frequency tables show
# 777 = "Refused to answer", 888 = "Does not apply" and 999 = "Don't know", so
# the labels follow that order (they were previously shifted by one position).
# The last category [999, 1e+06) holds both 999 and the 999999 missing code.
# NOTE(review): code 9 is also written into D_4 later in this script for
# records that fail 2-anonymity; confirm the label suits that use.
labels_edu <- c(
  "Primary or less (0-5)" = 1,
  "Lower secondary (6-8)" = 2,
  "Secondary (9-10)" = 3,
  "SLC (11)" = 4,
  "CLASS 12/Intermediate level (12)" = 5,
  "Bachelor/Postgraduate level" = 6,
  "Literate, but never attended school" = 7,
  "Illiterate, and never attended school" = 8,
  "Refused to answer" = 9,
  "Does not apply" = 10,
  "Don't know / NA" = 11
)
# Recode D_4 with the education breaks and labels.
mydata <- ordinal_recode(
  variable = "D_4",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## D_4. What is your highest completed education level? [You do not need to read the re
## Pre-school/Kindergarten CLASS 1 CLASS 2
## 1 31 54
## CLASS 3 CLASS 4 CLASS 5
## 71 75 151
## CLASS 6 CLASS 7 CLASS 8
## 69 85 120
## CLASS 9 CLASS 10 SLC
## 84 104 298
## CLASS 12/Intermediate level Bachelor level Post-Secondary Level (e.g., MA, PhD)
## 266 62 9
## Literate, but never attended school Illiterate, and never attended school
## 305 242
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888) [888,999) [999,1e+06)
## 0 1 0 0 0 0 0 0 0 0 0 0
## 1 31 0 0 0 0 0 0 0 0 0 0
## 2 54 0 0 0 0 0 0 0 0 0 0
## 3 71 0 0 0 0 0 0 0 0 0 0
## 4 75 0 0 0 0 0 0 0 0 0 0
## 5 151 0 0 0 0 0 0 0 0 0 0
## 6 0 69 0 0 0 0 0 0 0 0 0
## 7 0 85 0 0 0 0 0 0 0 0 0
## 8 0 120 0 0 0 0 0 0 0 0 0
## 9 0 0 84 0 0 0 0 0 0 0 0
## 10 0 0 104 0 0 0 0 0 0 0 0
## 11 0 0 0 298 0 0 0 0 0 0 0
## 12 0 0 0 0 266 0 0 0 0 0 0
## 13 0 0 0 0 0 62 0 0 0 0 0
## 14 0 0 0 0 0 9 0 0 0 0 0
## 15 0 0 0 0 0 0 305 0 0 0 0
## 16 0 0 0 0 0 0 0 242 0 0 0
## 999999 0 0 0 0 0 0 0 0 0 0 1984
## [1] "Frequency table after encoding"
## D_4. What is your highest completed education level? [You do not need to read the re
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 383 274 188
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 298 266 71
## Literate, but never attended school Illiterate, and never attended school NA
## 305 242 1984
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 1 2 3
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 4 5 6
## Literate, but never attended school Illiterate, and never attended school Does not apply
## 7 8 9
## Don't Know NA
## 10 11
# Recode HC4_1 with the education breaks and labels.
mydata <- ordinal_recode(
  variable = "HC4_1",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## HC4_1. What is the highest completed education level of your spouse? [You do not need
## CLASS 1 CLASS 2 CLASS 3
## 15 48 48
## CLASS 4 CLASS 5 CLASS 6
## 68 115 57
## CLASS 7 CLASS 8 CLASS 9
## 77 114 69
## CLASS 10 SLC CLASS 12/Intermediate level
## 87 188 127
## Bachelor level Post-Secondary Level (e.g., MA, PhD) Literate, but never attended school
## 35 11 238
## Illiterate, and never attended school Does not apply Don't know
## 281 4 3
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888) [888,999) [999,1e+06)
## 1 15 0 0 0 0 0 0 0 0 0 0
## 2 48 0 0 0 0 0 0 0 0 0 0
## 3 48 0 0 0 0 0 0 0 0 0 0
## 4 68 0 0 0 0 0 0 0 0 0 0
## 5 115 0 0 0 0 0 0 0 0 0 0
## 6 0 57 0 0 0 0 0 0 0 0 0
## 7 0 77 0 0 0 0 0 0 0 0 0
## 8 0 114 0 0 0 0 0 0 0 0 0
## 9 0 0 69 0 0 0 0 0 0 0 0
## 10 0 0 87 0 0 0 0 0 0 0 0
## 11 0 0 0 188 0 0 0 0 0 0 0
## 12 0 0 0 0 127 0 0 0 0 0 0
## 13 0 0 0 0 0 35 0 0 0 0 0
## 14 0 0 0 0 0 11 0 0 0 0 0
## 15 0 0 0 0 0 0 238 0 0 0 0
## 16 0 0 0 0 0 0 0 281 0 0 0
## 888 0 0 0 0 0 0 0 0 0 4 0
## 999 0 0 0 0 0 0 0 0 0 0 3
## 999999 0 0 0 0 0 0 0 0 0 0 2426
## [1] "Frequency table after encoding"
## HC4_1. What is the highest completed education level of your spouse? [You do not need
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 294 248 156
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 188 127 46
## Literate, but never attended school Illiterate, and never attended school Don't Know
## 238 281 4
## NA
## 2429
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 1 2 3
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 4 5 6
## Literate, but never attended school Illiterate, and never attended school Does not apply
## 7 8 9
## Don't Know NA
## 10 11
# Recode HC4_2 with the education breaks and labels.
mydata <- ordinal_recode(
  variable = "HC4_2",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## HC4_2. What is the highest completed education level of your father? [You do not need
## Pre-school/Kindergarten CLASS 1 CLASS 2
## 1 12 27
## CLASS 3 CLASS 4 CLASS 5
## 33 23 61
## CLASS 6 CLASS 7 CLASS 8
## 15 21 44
## CLASS 9 CLASS 10 SLC
## 22 26 48
## CLASS 12/Intermediate level Bachelor level Post-Secondary Level (e.g., MA, PhD)
## 29 4 2
## Literate, but never attended school Illiterate, and never attended school Refused to answer
## 121 111 1
## Does not apply Don't know
## 51 11
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888) [888,999) [999,1e+06)
## 0 1 0 0 0 0 0 0 0 0 0 0
## 1 12 0 0 0 0 0 0 0 0 0 0
## 2 27 0 0 0 0 0 0 0 0 0 0
## 3 33 0 0 0 0 0 0 0 0 0 0
## 4 23 0 0 0 0 0 0 0 0 0 0
## 5 61 0 0 0 0 0 0 0 0 0 0
## 6 0 15 0 0 0 0 0 0 0 0 0
## 7 0 21 0 0 0 0 0 0 0 0 0
## 8 0 44 0 0 0 0 0 0 0 0 0
## 9 0 0 22 0 0 0 0 0 0 0 0
## 10 0 0 26 0 0 0 0 0 0 0 0
## 11 0 0 0 48 0 0 0 0 0 0 0
## 12 0 0 0 0 29 0 0 0 0 0 0
## 13 0 0 0 0 0 4 0 0 0 0 0
## 14 0 0 0 0 0 2 0 0 0 0 0
## 15 0 0 0 0 0 0 121 0 0 0 0
## 16 0 0 0 0 0 0 0 111 0 0 0
## 777 0 0 0 0 0 0 0 0 1 0 0
## 888 0 0 0 0 0 0 0 0 0 51 0
## 999 0 0 0 0 0 0 0 0 0 0 11
## 999999 0 0 0 0 0 0 0 0 0 0 3348
## [1] "Frequency table after encoding"
## HC4_2. What is the highest completed education level of your father? [You do not need
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 157 80 48
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 48 29 6
## Literate, but never attended school Illiterate, and never attended school Does not apply
## 121 111 1
## Don't Know NA
## 51 3359
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 1 2 3
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 4 5 6
## Literate, but never attended school Illiterate, and never attended school Does not apply
## 7 8 9
## Don't Know NA
## 10 11
# Recode HC4_3 with the education breaks and labels.
mydata <- ordinal_recode(
  variable = "HC4_3",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## HC4_3. What is the highest completed education level of your mother? If you have more t
## CLASS 1 CLASS 2 CLASS 3
## 3 13 10
## CLASS 4 CLASS 5 CLASS 6
## 21 24 13
## CLASS 7 CLASS 8 CLASS 9
## 6 20 5
## CLASS 10 SLC CLASS 12/Intermediate level
## 9 23 5
## Bachelor level Literate, but never attended school Illiterate, and never attended school
## 1 227 260
## Does not apply Don't know
## 20 3
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888) [888,999) [999,1e+06)
## 1 3 0 0 0 0 0 0 0 0 0 0
## 2 13 0 0 0 0 0 0 0 0 0 0
## 3 10 0 0 0 0 0 0 0 0 0 0
## 4 21 0 0 0 0 0 0 0 0 0 0
## 5 24 0 0 0 0 0 0 0 0 0 0
## 6 0 13 0 0 0 0 0 0 0 0 0
## 7 0 6 0 0 0 0 0 0 0 0 0
## 8 0 20 0 0 0 0 0 0 0 0 0
## 9 0 0 5 0 0 0 0 0 0 0 0
## 10 0 0 9 0 0 0 0 0 0 0 0
## 11 0 0 0 23 0 0 0 0 0 0 0
## 12 0 0 0 0 5 0 0 0 0 0 0
## 13 0 0 0 0 0 1 0 0 0 0 0
## 15 0 0 0 0 0 0 227 0 0 0 0
## 16 0 0 0 0 0 0 0 260 0 0 0
## 888 0 0 0 0 0 0 0 0 0 20 0
## 999 0 0 0 0 0 0 0 0 0 0 3
## 999999 0 0 0 0 0 0 0 0 0 0 3348
## [1] "Frequency table after encoding"
## HC4_3. What is the highest completed education level of your mother? If you have more t
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 71 39 14
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 23 5 1
## Literate, but never attended school Illiterate, and never attended school Don't Know
## 227 260 20
## NA
## 3351
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 1 2 3
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 4 5 6
## Literate, but never attended school Illiterate, and never attended school Does not apply
## 7 8 9
## Don't Know NA
## 10 11
# Recode HC4_4 with the education breaks and labels.
mydata <- ordinal_recode(
  variable = "HC4_4",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## HC4_4. Think about your grandparents, and the grandparent with the most education. What
## CLASS 2 CLASS 3 CLASS 4
## 2 2 1
## CLASS 5 CLASS 9 SLC
## 2 1 1
## CLASS 12/Intermediate level Literate, but never attended school Illiterate, and never attended school
## 2 29 56
## Don't know
## 3
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888) [888,999) [999,1e+06)
## 2 2 0 0 0 0 0 0 0 0 0 0
## 3 2 0 0 0 0 0 0 0 0 0 0
## 4 1 0 0 0 0 0 0 0 0 0 0
## 5 2 0 0 0 0 0 0 0 0 0 0
## 9 0 0 1 0 0 0 0 0 0 0 0
## 11 0 0 0 1 0 0 0 0 0 0 0
## 12 0 0 0 0 2 0 0 0 0 0 0
## 15 0 0 0 0 0 0 29 0 0 0 0
## 16 0 0 0 0 0 0 0 56 0 0 0
## 999 0 0 0 0 0 0 0 0 0 0 3
## 999999 0 0 0 0 0 0 0 0 0 0 3912
## [1] "Frequency table after encoding"
## HC4_4. Think about your grandparents, and the grandparent with the most education. What
## Primary or less (0-5) Secondary (9-10) SLC (11)
## 7 1 1
## CLASS 12/Intermediate level (12) Literate, but never attended school Illiterate, and never attended school
## 2 29 56
## NA
## 3915
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8) Secondary (9-10)
## 1 2 3
## SLC (11) CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 4 5 6
## Literate, but never attended school Illiterate, and never attended school Does not apply
## 7 8 9
## Don't Know NA
## 10 11
# Re-run to check 2-anonymity
# Rebuild the sdcMicro object on the recoded data with the same key variables.
selectedKeyVars <- c("D_4", "IDR3_20")
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)
sdcInitial
## The input dataset consists of 4011 rows and 1162 variables.
## --> Categorical key variables: D_4, IDR3_20
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## D_4 9 (9) 445.667 (445.667) 71 (71)
## IDR3_20 7 (7) 573.000 (573.000) 37 (37)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 2 (0.050%)
## - 3-anonymity: 6 (0.150%)
## - 5-anonymity: 19 (0.474%)
##
## ----------------------------------------------------------------------
# Show values of key variable of records that violate k-anonymity
# Logical flag per record: TRUE where column 2 of the individual-risk matrix
# is below 2 (for 2-anonymity).
# NOTE(review): column 2 is presumably the sample frequency count `fk` -
# confirm against the installed sdcMicro version.
notAnon <- slot(sdcInitial, "risk")$individual[, 2] < 2
# Print the key-variable values of the flagged records.
mydata[notAnon, selectedKeyVars]
## Registered S3 method overwritten by 'cli':
## method from
## print.boxx spatstat.geom
## # A tibble: 2 x 2
## D_4 IDR3_20
## <dbl+lbl> <dbl+lbl>
## 1 6 [Bachelor/Postgraduate level] 5 [55-64]
## 2 11 [NA] 4 [45-54]
# Apply sdcMicro local suppression to the key variables.
sdcFinal <- localSuppression(sdcInitial)
# Recombining anonymized variables: print the manipulated key variables for
# the records flagged in `notAnon`.
extractManipData(sdcFinal)[notAnon, selectedKeyVars]
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
## D_4 IDR3_20
## 829 NA 5
## 1078 NA 4
# Overwrite D_4 with code 9 for the records flagged in `notAnon`.
# NOTE(review): 9 is one of the value codes defined in `labels_edu`, whereas
# the localSuppression() output above shows NA for these records - confirm
# that 9 is the intended marker for suppressed values.
mydata[notAnon, "D_4"] <- 9
# !!! Identify open-end variables here:
# Each variable is listed once ("I_1_P14D_12_TEXT" previously appeared twice).
open_ends <- c(
  "I_1_P14D_12_TEXT",
  "RvwComment", "SrvyrComment",
  "H2_12_TEXT", "HTNx3_2_14_TEXT", "HTV_1_10_TEXTx3", "HTV_3_11_TEXTx3",
  "CPR5i_TEXT", "G1_00_08_TEXT",
  "P13A_10_TEXT", "P14A_12_TEXT",
  "P13B_10_TEXT", "P14B_12_TEXT", "SIMPOC7B_10_TEXT",
  "I_1_P13C_10_TEXT", "I_1_P14C_12_TEXT", "I_1_SIMPOC7C_10_TEXT",
  "I_2_P14C_12_TEXT",
  "I_1_P13D_10_TEXT",
  "I_2_P14D_12_TEXT",
  "I_1_P13E_10_TEXT", "I_1_P14E_12_TEXT", "I_1_SIMPOC7E_10_TEXT",
  "I_2_P14E_12_TEXT", "I_3_P14E_12_TEXT",
  "NEW_3_12_TEXT", "NEW_9",
  "I_1_Q_559_S", "I_1_SIMPOC7_cl_10_TEXT",
  "I_2_NEW_9_cl", "I_2_SIMPOC7_cl_10_TEXT",
  "I_3_NEW_9_cl", "I_3_Q_559_S", "I_3_SIMPOC7_cl_10_TEXT",
  "e3e_TEXT", "E2_11_8_TEXT", "E_14_7_TEXT",
  "L1_other_text", "L2_other_text"
)
# Run the open-end report on the variables listed in `open_ends`.
report_open(list_open_ends = open_ends)
# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number
# The SrvyrComment column is removed from the data.
mydata <- mydata[!(names(mydata) %in% "SrvyrComment")]
# Setup map
# World map data restricted to the rows whose region is "Nepal".
countrymap <- filter(map_data("world"), region == "Nepal") #!!! Select correct country
# Alternative download of the boundary, currently disabled:
#admin <- raster::getData("GADM", country="NP", level=0) #!!! Select correct country map using standard 2-letter country codes: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
# The boundary object is read from a local RDS file instead.
admin <- readRDS("gadm36_NPL_0_sp.rds")
# Displace all pairs of GPS variables (Longitude, Latitude). Check summary statistics and maps before and after displacement.
# !!!Include relevant variables, always longitude first, latitude second.
gps.vars <- c("Longitude", "Latitude")
# May take a few minutes to process.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
)
## Warning: Removed 53 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## Longitude Latitude
## Min. :84.31 Min. :26.85
## 1st Qu.:85.07 1st Qu.:27.56
## Median :85.46 Median :27.65
## Mean :85.35 Mean :27.59
## 3rd Qu.:85.61 3rd Qu.:27.73
## Max. :86.15 Max. :28.00
## NA's :53 NA's :53
## Warning: Removed 53 rows containing missing values (geom_point).
## Warning: Removed 53 rows containing missing values (geom_point).
## Warning: Removed 53 rows containing missing values (geom_point).
## Warning: Removed 53 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## Longitude Latitude
## Min. :84.28 Min. :26.84
## 1st Qu.:85.06 1st Qu.:27.55
## Median :85.46 Median :27.64
## Mean :85.35 Mean :27.59
## 3rd Qu.:85.61 3rd Qu.:27.73
## Max. :86.19 Max. :28.04
## NA's :53 NA's :53
## [1] "Processing time = 9.0734845995903"
# !!!Include relevant variables, always longitude first, latitude second.
gps.vars <- c("GPSinitial_LO", "GPSinitial_LA")
# May take a few minutes to process.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
)
## Warning: Removed 170 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## GPSinitial_LO GPSinitial_LA
## Min. :84.31 Min. :26.85
## 1st Qu.:85.07 1st Qu.:27.56
## Median :85.47 Median :27.65
## Mean :85.35 Mean :27.59
## 3rd Qu.:85.61 3rd Qu.:27.73
## Max. :86.15 Max. :28.00
## NA's :170 NA's :170
## Warning: Removed 170 rows containing missing values (geom_point).
## Warning: Removed 170 rows containing missing values (geom_point).
## Warning: Removed 170 rows containing missing values (geom_point).
## Warning: Removed 170 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## GPSinitial_LO GPSinitial_LA
## Min. :84.28 Min. :26.82
## 1st Qu.:85.07 1st Qu.:27.55
## Median :85.46 Median :27.64
## Mean :85.35 Mean :27.59
## 3rd Qu.:85.61 3rd Qu.:27.73
## Max. :86.18 Max. :28.03
## NA's :170 NA's :170
## [1] "Processing time = 10.2460972825686"
# !!!Include relevant variables, always longitude first, latitude second.
gps.vars <- c("gps_CEa_LO", "gps_CEa_LA")
# May take a few minutes to process.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
)
## Warning: Removed 2272 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## gps_CEa_LO gps_CEa_LA
## Min. :84.31 Min. :26.85
## 1st Qu.:84.96 1st Qu.:27.56
## Median :85.45 Median :27.64
## Mean :85.33 Mean :27.59
## 3rd Qu.:85.60 3rd Qu.:27.72
## Max. :86.15 Max. :28.00
## NA's :2272 NA's :2272
## Warning: Removed 2272 rows containing missing values (geom_point).
## Warning: Removed 2272 rows containing missing values (geom_point).
## Warning: Removed 2272 rows containing missing values (geom_point).
## Warning: Removed 2272 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## gps_CEa_LO gps_CEa_LA
## Min. :84.30 Min. :26.85
## 1st Qu.:84.98 1st Qu.:27.55
## Median :85.45 Median :27.63
## Mean :85.33 Mean :27.59
## 3rd Qu.:85.60 3rd Qu.:27.72
## Max. :86.18 Max. :28.03
## NA's :2272 NA's :2272
## [1] "Processing time = 4.04510399897893"
# !!!Include relevant variables, always longitude first, latitude second.
gps.vars <- c("gpsenumimp_LO", "gpsenumimp_LA")
# May take a few minutes to process.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
)
## Warning: Removed 2249 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## gpsenumimp_LO gpsenumimp_LA
## Min. :84.31 Min. :26.85
## 1st Qu.:84.96 1st Qu.:27.56
## Median :85.45 Median :27.64
## Mean :85.32 Mean :27.59
## 3rd Qu.:85.60 3rd Qu.:27.72
## Max. :86.15 Max. :28.00
## NA's :2249 NA's :2249
## Warning: Removed 2249 rows containing missing values (geom_point).
## Warning: Removed 2249 rows containing missing values (geom_point).
## Warning: Removed 2249 rows containing missing values (geom_point).
## Warning: Removed 2249 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## gpsenumimp_LO gpsenumimp_LA
## Min. :84.28 Min. :26.83
## 1st Qu.:84.97 1st Qu.:27.55
## Median :85.44 Median :27.64
## Mean :85.32 Mean :27.59
## 3rd Qu.:85.60 3rd Qu.:27.72
## Max. :86.19 Max. :28.04
## NA's :2249 NA's :2249
## [1] "Processing time = 3.81436225175858"
# Adds "_PU" (Public Use) to the end of the name
# Write the processed data in Stata (.dta) and SPSS (.sav) formats, with
# "_PU" appended to the base file name.
haven::write_dta(data = mydata, path = paste0(filename, "_PU.dta"))
haven::write_sav(data = mydata, path = paste0(filename, "_PU.sav"))
# Print the frequency table for update6_1.
haven_table("update6_1")
## update6_1. Regardless of if you have been a victim of physical and/or sexual violence from
## Extremely familiar Very familiar Somewhat familiar A little familiar Not at all familiar Refused to answer
## 121 383 383 107 55 1
## Don't know 999999
## 1 2960