# Start from an empty workspace, including hidden (dot-prefixed) objects.
# The argument is spelled out as `all.names = TRUE`: the previous `all=t`
# passed the base function `t` (transpose) rather than the logical TRUE and
# relied on partial argument matching, so the "all names" request did not
# take effect as intended.
rm(list = ls(all.names = TRUE))
filename <- "Nepal Round 3_FinalClean_Labor" # !!!Update filename
# Load the project helpers. NOTE(review): `mydata` and the encode/recode
# helpers used below are not defined in this script, so they presumably come
# from this file (which likely reads `filename`) - confirm.
source("functions_1.7.R")
## --------
## This is sdcMicro v5.6.0.
## For references, please have a look at citation('sdcMicro')
## Note: since version 5.0.0, the graphical user-interface is a shiny-app that can be started with sdcApp().
## Please submit suggestions and bugs at: https://github.com/sdcTools/sdcMicro/issues
## --------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: sp
## Checking rgeos availability: TRUE
##
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:sdcMicro':
##
## freq
## rgdal: version: 1.5-23, (SVN revision 1121)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.2.1, released 2020/12/29
## Path to GDAL shared files: C:/Users/C_Pablo_Diego-Rosell/Documents/R/R-3.6.3/library/rgdal/gdal
## GDAL binary built with GEOS: TRUE
## Loaded PROJ runtime: Rel. 7.2.1, January 1st, 2021, [PJ_VERSION: 721]
## Path to PROJ shared files: C:/Users/C_Pablo_Diego-Rosell/Documents/R/R-3.6.3/library/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.4-5
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
## Overwritten PROJ_LIB was C:/Users/C_Pablo_Diego-Rosell/Documents/R/R-3.6.3/library/rgdal/proj
## Loading required package: spatstat.data
## Loading required package: spatstat.geom
## spatstat.geom 2.0-1
##
## Attaching package: 'spatstat.geom'
## The following objects are masked from 'package:raster':
##
## area, rotate, shift
## Loading required package: spatstat.core
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:raster':
##
## getData
## The following object is masked from 'package:dplyr':
##
## collapse
## Loading required package: rpart
## spatstat.core 2.0-0
## Loading required package: spatstat.linnet
## spatstat.linnet 2.1-1
##
## spatstat 2.0-1 (nickname: 'Caution: contains small parts')
## For an introduction to spatstat, type 'beginner'
## rgeos version: 0.5-5, (SVN revision 640)
## GEOS runtime version: 3.8.0-CAPI-1.13.1
## Linking to sp version: 1.4-4
## Polygon checking: TRUE
##
## Spatial Point Pattern Analysis Code in S-Plus
##
## Version 2 - Spatial and Space-Time analysis
##
## Attaching package: 'splancs'
## The following object is masked from 'package:raster':
##
## zoom
## The following object is masked from 'package:dplyr':
##
## tribble
## Loading required package: spam
## Loading required package: dotCall64
## Loading required package: grid
## Spam version 2.6-0 (2020-12-14) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
## See https://github.com/NCAR/Fields for
## an extensive vignette, other supplements and source code
##
## Attaching package: 'geosphere'
## The following object is masked from 'package:spatstat.geom':
##
## perimeter
##
## Attaching package: 'tibble'
## The following object is masked from 'package:splancs':
##
## tribble
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Location: Small Location (<100,000) Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary
#!!!Save flagged dictionary in .xlsx format and continue processing data with subset of flagged variables
# !!!Include any Direct PII variables
# Columns listed here are removed from the dataset outright.
dropvars <- c(
  "RvwName",
  "IDR3_18",
  "IDR3_19",
  "LE_reportedby",
  "flag_reportedby"
)
# Keep every column whose name is not in `dropvars`.
mydata <- mydata[!(names(mydata) %in% dropvars)]
# !!!Replace vector in "variables" field below with relevant variable names
# Encode Direct PII-team
# Replace the surveyor names in both surveyor columns with numeric codes.
# NOTE(review): the frequency output shows `surveyor` with one extra (blank)
# category, so the codes of the two columns are offset by one - confirm this
# is acceptable.
mydata <- encode_direct_PII_team(
  variables = c("Srvyr", "surveyor")
)
## [1] "Frequency table before encoding"
## Srvyr. Srvyr
## alka.adhikari ambir.raj.kulung amrita.roka anjana.kumari.dulal
## 79 96 89 98
## ashish.shrestha bhanu.bhakta.dhakal dev.raj.nepal dhan.kumari.darlami
## 82 77 2 85
## gita.maharjan kamala.sharma manjula.giri min.kumari.shrestha
## 99 78 99 86
## nabina.khadka niraj.shrestha pramila.shrestha pratika.shrestha
## 80 85 77 85
## rabischandra.bhatta ram.kumar.acharya sajina.shrestha sandip.shrestha
## 88 88 73 96
## sapana.gautam sarita.shrestha tirtha.maya.rai yamuna.karki
## 80 99 104 86
## [1] "Frequency table after encoding"
## Srvyr. Srvyr
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
## 79 96 89 98 82 77 2 85 99 78 99 86 80 85 77 85 88 88 73 96 80
## 22 23 24
## 99 104 86
## [1] "Frequency table before encoding"
## surveyor. Surveyor
## alka.adhikari ambir.raj.kulung amrita.roka
## 1 79 96 89
## anjana.kumari.dulal ashish.shrestha bhanu.bhakta.dhakal dev.raj.nepal
## 98 82 77 2
## dhan.kumari.darlami gita.maharjan kamala.sharma manjula.giri
## 85 99 78 99
## min.kumari.shrestha nabina.khadka niraj.shrestha pramila.shrestha
## 86 80 85 77
## pratika.shrestha rabischandra.bhatta ram.kumar.acharya sajina.shrestha
## 85 87 88 73
## sandip.shrestha sapana.gautam sarita.shrestha tirtha.maya.rai
## 96 80 99 104
## yamuna.karki
## 86
## [1] "Frequency table after encoding"
## surveyor. Surveyor
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
## 1 79 96 89 98 82 77 2 85 99 78 99 86 80 85 77 85 87 88 73 96
## 22 23 24 25
## 80 99 104 86
# !!!Include relevant variables, but check their population size first to confirm they are <100,000
# Small-location variables: the overall VDC code, the per-district
# VDC/municipality columns, and the ward number.
locvars <- c(
  "vdc",
  "IDR3_6_19",
  "IDR3_6_22",
  "IDR3_6_23",
  "IDR3_6_24",
  "IDR3_6_26",
  "IDR3_6_30",
  "IDR3_6_31",
  "IDR3_6_35",
  "IDR3_7"
)
# Replace location values with numeric codes; 999999 is passed as the
# missing-value code (the frequency output shows it left unchanged).
mydata <- encode_location(
  variables = locvars,
  missing = 999999
)
## [1] "Frequency table before encoding"
## vdc. VDC code
## Barahathawa Dhungrekhola Lalbandi Malangawa NP
## 38 29 61 27
## Netraganj Raniganj Sankarpur Bhimeswor NP
## 34 37 28 32
## Bocha Dandakharka Fasku Katakuti
## 17 32 31 31
## Lamidanda Melung Pawati Badegau
## 29 34 27 24
## Irkhu BhoteNamlang Talamarang Ichok
## 33 32 28 32
## Kadambas Langarche Melamchi Anaikot
## 25 32 34 25
## Baluwapati Deupur Chalal Ganeshthan Kalati Bhumidanda Mahankal Chaur
## 21 41 41 26
## Methinkot Patalekhet Raviopi Balkot
## 35 25 42 22
## Changunarayan Chitapol Duwakot Gundu
## 35 34 27 33
## Madhyapur Thimi NP Nankhel Sirutar Baireni
## 29 33 36 36
## Dhussa Khari Kiranchok Naubise
## 31 32 34 27
## Salyantar Sunaula Bazar Thakre Chitlang
## 35 33 24 27
## Churiyamai Fakhel Padampokhari Kulekhani
## 28 30 39 37
## Nibuwatar Shreepur Chhatiwan Sisneri Mahadevsthan Birendranagar
## 32 37 29 18
## Jutpani Kathar Khairahani Padampur
## 39 37 32 38
## Parbatipur Piple Shaktikhor
## 31 37 36
## [1] "Frequency table after encoding"
## vdc. VDC code
## 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487
## 28 32 34 28 36 38 31 28 27 32 25 39 41 33 22 37 33 27 29 25 31
## 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508
## 21 29 41 37 34 27 34 32 27 34 34 32 37 27 36 24 42 31 30 39 33
## 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529
## 32 37 35 25 32 32 32 31 29 36 18 26 35 35 32 38 37 17 33 24 29
## 530
## 29
## [1] "Frequency table before encoding"
## IDR3_6_19. VDC or Municaplity of District Sarlahi
## Barahathawa Dhungrekhola Dhurkauli Lalbandi Malangawa NP Netraganj
## 38 29 29 32 27 34
## Raniganj Sankarpur 999999
## 37 28 1757
## [1] "Frequency table after encoding"
## IDR3_6_19. VDC or Municaplity of District Sarlahi
## 904 905 906 907 908 909 910 912 999999
## 34 27 37 28 29 38 29 32 1757
## [1] "Frequency table before encoding"
## IDR3_6_22. VDC or Municaplity of District Dolakha
## Bhimeswor NP Bocha Dandakharka Fasku Katakuti Lamidanda
## 32 17 32 31 31 29
## Melung Pawati 999999
## 34 27 1778
## [1] "Frequency table after encoding"
## IDR3_6_22. VDC or Municaplity of District Dolakha
## 876 877 878 879 880 881 883 884 999999
## 27 31 29 34 32 32 17 31 1778
## [1] "Frequency table before encoding"
## IDR3_6_23. VDC or Municaplity of District Sindhupalchok
## Badegau Irkhu BhoteNamlang Talamarang Ichok Kadambas
## 24 33 32 28 32 25
## Langarche Melamchi 999999
## 32 34 1771
## [1] "Frequency table after encoding"
## IDR3_6_23. VDC or Municaplity of District Sindhupalchok
## 513 514 515 516 517 518 520 521 999999
## 33 34 32 32 25 32 28 24 1771
## [1] "Frequency table before encoding"
## IDR3_6_24. VDC or Municaplity of District Kavrepalanchok
## Anaikot Baluwapati Deupur Chalal Ganeshthan Kalati Bhumidanda
## 25 21 41 41
## Mahankal Chaur Methinkot Patalekhet Raviopi
## 26 35 25 42
## 999999
## 1755
## [1] "Frequency table after encoding"
## IDR3_6_24. VDC or Municaplity of District Kavrepalanchok
## 689 690 691 692 693 694 695 697 999999
## 21 42 25 41 25 35 26 41 1755
## [1] "Frequency table before encoding"
## IDR3_6_26. VDC or Municaplity of District Bhaktapur
## Balkot Changunarayan Chitapol Duwakot
## 22 35 34 27
## Gundu Madhyapur Thimi NP Nankhel Sirutar
## 33 29 33 36
## 999999
## 1762
## [1] "Frequency table after encoding"
## IDR3_6_26. VDC or Municaplity of District Bhaktapur
## 405 406 407 408 410 411 412 413 999999
## 34 35 33 22 33 27 36 29 1762
## [1] "Frequency table before encoding"
## IDR3_6_30. VDC or Municaplity of District Dhading
## Baireni Dhussa Khari Kiranchok Naubise Salyantar
## 36 31 32 34 27 35
## Sunaula Bazar Thakre 999999
## 33 24 1759
## [1] "Frequency table after encoding"
## IDR3_6_30. VDC or Municaplity of District Dhading
## 634 635 636 637 639 640 641 642 999999
## 32 34 36 31 33 35 27 24 1759
## [1] "Frequency table before encoding"
## IDR3_6_31. VDC or Municaplity of District Makwanpur
## Chitlang Churiyamai Fakhel Padampokhari
## 27 28 30 39
## Kulekhani Nibuwatar Shreepur Chhatiwan Sisneri Mahadevsthan
## 37 32 37 29
## 999999
## 1752
## [1] "Frequency table after encoding"
## IDR3_6_31. VDC or Municaplity of District Makwanpur
## 798 799 800 801 802 803 804 805 999999
## 29 30 28 37 37 39 32 27 1752
## [1] "Frequency table before encoding"
## IDR3_6_35. VDC or Municaplity of District Chitwan
## Birendranagar Jutpani Kathar Khairahani Padampur Parbatipur
## 18 39 37 32 38 31
## Piple Shaktikhor 999999
## 37 36 1743
## [1] "Frequency table after encoding"
## IDR3_6_35. VDC or Municaplity of District Chitwan
## 875 876 877 878 879 880 881 882 999999
## 39 37 18 31 37 38 32 36 1743
## [1] "Frequency table before encoding"
## IDR3_7. Ward Number
## 1 2 3 4 5 6 7 8 9 10 11 12 14
## 263 246 169 237 162 290 209 155 218 11 19 22 10
## [1] "Frequency table after encoding"
## IDR3_7. Ward Number
## 533 534 535 536 537 538 539 540 541 542 543 544 545
## 209 19 169 237 246 290 218 263 155 162 10 22 11
# Focus on variables with a "Lowest Freq" of 10 or less.
# Lower bounds of the ten-year age bands, plus 100 as the upper bound of the
# open-ended oldest band.
break_age <- c(seq(15, 65, by = 10), 100)
# Value labels for the recoded bands; code 7 ("NA") labels whatever falls
# above the last break point.
labels_age <- setNames(
  c(1, 2, 3, 4, 5, 6, 7),
  c("15-24", "25-34", "35-44", "45-54", "55-64", "65 and older", "NA")
)
# Collapse respondent age (IDR3_20) into the bands defined by `break_age`.
mydata <- ordinal_recode(
  variable = "IDR3_20",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)
## [1] "Frequency table before encoding"
## IDR3_20. How old are you?
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
## 18 44 60 62 63 72 68 72 78 60 49 41 35 53 59 50 56 54 53 52 36 35 37 51 34 42 28 41
## 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
## 44 31 36 34 37 34 32 28 31 29 28 23 27 19 25 32 15 18 17 18 13 13 9 12 2 1
## recoded
## [15,25) [25,35) [35,45) [45,55) [55,65) [65,100) [100,1e+06)
## 16 18 0 0 0 0 0 0
## 17 44 0 0 0 0 0 0
## 18 60 0 0 0 0 0 0
## 19 62 0 0 0 0 0 0
## 20 63 0 0 0 0 0 0
## 21 72 0 0 0 0 0 0
## 22 68 0 0 0 0 0 0
## 23 72 0 0 0 0 0 0
## 24 78 0 0 0 0 0 0
## 25 0 60 0 0 0 0 0
## 26 0 49 0 0 0 0 0
## 27 0 41 0 0 0 0 0
## 28 0 35 0 0 0 0 0
## 29 0 53 0 0 0 0 0
## 30 0 59 0 0 0 0 0
## 31 0 50 0 0 0 0 0
## 32 0 56 0 0 0 0 0
## 33 0 54 0 0 0 0 0
## 34 0 53 0 0 0 0 0
## 35 0 0 52 0 0 0 0
## 36 0 0 36 0 0 0 0
## 37 0 0 35 0 0 0 0
## 38 0 0 37 0 0 0 0
## 39 0 0 51 0 0 0 0
## 40 0 0 34 0 0 0 0
## 41 0 0 42 0 0 0 0
## 42 0 0 28 0 0 0 0
## 43 0 0 41 0 0 0 0
## 44 0 0 44 0 0 0 0
## 45 0 0 0 31 0 0 0
## 46 0 0 0 36 0 0 0
## 47 0 0 0 34 0 0 0
## 48 0 0 0 37 0 0 0
## 49 0 0 0 34 0 0 0
## 50 0 0 0 32 0 0 0
## 51 0 0 0 28 0 0 0
## 52 0 0 0 31 0 0 0
## 53 0 0 0 29 0 0 0
## 54 0 0 0 28 0 0 0
## 55 0 0 0 0 23 0 0
## 56 0 0 0 0 27 0 0
## 57 0 0 0 0 19 0 0
## 58 0 0 0 0 25 0 0
## 59 0 0 0 0 32 0 0
## 60 0 0 0 0 15 0 0
## 61 0 0 0 0 18 0 0
## 62 0 0 0 0 17 0 0
## 63 0 0 0 0 18 0 0
## 64 0 0 0 0 13 0 0
## 65 0 0 0 0 0 13 0
## 66 0 0 0 0 0 9 0
## 67 0 0 0 0 0 12 0
## 68 0 0 0 0 0 2 0
## 69 0 0 0 0 0 1 0
## [1] "Frequency table after encoding"
## IDR3_20. How old are you?
## 15-24 25-34 35-44 45-54 55-64 65 and older
## 537 510 400 320 207 37
## [1] "Inspect value labels and relabel as necessary"
## 15-24 25-34 35-44 45-54 55-64 65 and older
## 1 2 3 4 5 6
## NA
## 7
# Apply the same age banding to `age`, whose frequency output matches
# IDR3_20.
mydata <- ordinal_recode(
  variable = "age",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)
## [1] "Frequency table before encoding"
## age. How old are you?
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
## 18 44 60 62 63 72 68 72 78 60 49 41 35 53 59 50 56 54 53 52 36 35 37 51 34 42 28 41
## 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
## 44 31 36 34 37 34 32 28 31 29 28 23 27 19 25 32 15 18 17 18 13 13 9 12 2 1
## recoded
## [15,25) [25,35) [35,45) [45,55) [55,65) [65,100) [100,1e+06)
## 16 18 0 0 0 0 0 0
## 17 44 0 0 0 0 0 0
## 18 60 0 0 0 0 0 0
## 19 62 0 0 0 0 0 0
## 20 63 0 0 0 0 0 0
## 21 72 0 0 0 0 0 0
## 22 68 0 0 0 0 0 0
## 23 72 0 0 0 0 0 0
## 24 78 0 0 0 0 0 0
## 25 0 60 0 0 0 0 0
## 26 0 49 0 0 0 0 0
## 27 0 41 0 0 0 0 0
## 28 0 35 0 0 0 0 0
## 29 0 53 0 0 0 0 0
## 30 0 59 0 0 0 0 0
## 31 0 50 0 0 0 0 0
## 32 0 56 0 0 0 0 0
## 33 0 54 0 0 0 0 0
## 34 0 53 0 0 0 0 0
## 35 0 0 52 0 0 0 0
## 36 0 0 36 0 0 0 0
## 37 0 0 35 0 0 0 0
## 38 0 0 37 0 0 0 0
## 39 0 0 51 0 0 0 0
## 40 0 0 34 0 0 0 0
## 41 0 0 42 0 0 0 0
## 42 0 0 28 0 0 0 0
## 43 0 0 41 0 0 0 0
## 44 0 0 44 0 0 0 0
## 45 0 0 0 31 0 0 0
## 46 0 0 0 36 0 0 0
## 47 0 0 0 34 0 0 0
## 48 0 0 0 37 0 0 0
## 49 0 0 0 34 0 0 0
## 50 0 0 0 32 0 0 0
## 51 0 0 0 28 0 0 0
## 52 0 0 0 31 0 0 0
## 53 0 0 0 29 0 0 0
## 54 0 0 0 28 0 0 0
## 55 0 0 0 0 23 0 0
## 56 0 0 0 0 27 0 0
## 57 0 0 0 0 19 0 0
## 58 0 0 0 0 25 0 0
## 59 0 0 0 0 32 0 0
## 60 0 0 0 0 15 0 0
## 61 0 0 0 0 18 0 0
## 62 0 0 0 0 17 0 0
## 63 0 0 0 0 18 0 0
## 64 0 0 0 0 13 0 0
## 65 0 0 0 0 0 13 0
## 66 0 0 0 0 0 9 0
## 67 0 0 0 0 0 12 0
## 68 0 0 0 0 0 2 0
## 69 0 0 0 0 0 1 0
## [1] "Frequency table after encoding"
## age. How old are you?
## 15-24 25-34 35-44 45-54 55-64 65 and older
## 537 510 400 320 207 37
## [1] "Inspect value labels and relabel as necessary"
## 15-24 25-34 35-44 45-54 55-64 65 and older
## 1 2 3 4 5 6
## NA
## 7
# !!!Include relevant variables in list below
# Candidate indirect identifiers, kept in their original order and grouped
# by variable-name stem for readability.
indirect_PII <- c(
  # D_9 / HC* / D_4 / Inc_17 items
  "D_9",
  "HC2_O1", "HC2_O2", "HC2_O3", "HC2_O4", "HC2_O5", "HC2_O6",
  "H2_12_TEXT",
  "HC3",
  "HC4_1", "HC4_2", "HC4_3", "HC4_4",
  "D_4",
  "Inc_17",
  # con1A / con1B items, I1 (I1 also carries the *_crime_* items and T_233_1)
  "con1A_gender_I1", "con1A_age_I1", "con1A_caste_I1",
  "con1A_crime_I1", "con1A_income_I1", "con1A_educ_I1",
  "con1B_gender_I1", "con1B_age_I1", "con1B_caste_I1",
  "con1B_crime_I1", "con1B_income_I1", "con1B_educ_I1",
  "T_233_1_I1",
  # con1A / con1B items, I2
  "con1A_gender_I2", "con1A_age_I2", "con1A_caste_I2",
  "con1A_income_I2", "con1A_educ_I2",
  "con1B_gender_I2", "con1B_age_I2", "con1B_caste_I2",
  "con1B_income_I2", "con1B_educ_I2",
  # con1A / con1B items, I3
  "con1A_gender_I3", "con1A_age_I3", "con1A_caste_I3",
  "con1A_income_I3", "con1A_educ_I3",
  "con1B_gender_I3", "con1B_age_I3", "con1B_caste_I3",
  "con1B_income_I3", "con1B_educ_I3",
  # con2A / con2B items, I1
  "con2A_gender_I1", "con2A_age_I1", "con2A_caste_I1",
  "con2A_income_I1", "con2A_educ_I1",
  "con2B_gender_I1", "con2B_age_I1", "con2B_caste_I1",
  "con2B_income_I1", "con2B_educ_I1",
  # con2A / con2B items, I2
  "con2A_gender_I2", "con2A_age_I2", "con2A_caste_I2",
  "con2A_income_I2", "con2A_educ_I2",
  "con2B_gender_I2", "con2B_age_I2", "con2B_caste_I2",
  "con2B_income_I2", "con2B_educ_I2",
  # con2A / con2B items, I3
  "con2A_gender_I3", "con2A_age_I3", "con2A_caste_I3",
  "con2A_income_I3", "con2A_educ_I3",
  "con2B_gender_I3", "con2B_age_I3", "con2B_caste_I3",
  "con2B_income_I3", "con2B_educ_I3",
  # P1-P4 and P8 items
  "P1", "P1A", "P2", "P2A", "P3", "P3A", "P4", "P4A",
  "P8_O1", "P8_O2", "P8_O3",
  "P8_3_number", "P8_4_number", "P8_5_number",
  # P9-P13, sections A and B
  "P12A", "P12A_TEXT", "P13A_O1", "P13A_O2", "P13A_10_TEXT",
  "P9B", "P10B", "P12B", "P13B_O1", "P13B_O2", "P13B_10_TEXT",
  # P9-P13, section C (I1, I2)
  "P9C_I1", "P10C_I1", "P11C_I1", "P11_A3_I1",
  "P12C_I1", "P12C_TEXT_I1", "P13C_O1_I1", "P13C_10_TEXT_I1",
  "P9C_I2", "P10C_I2", "P11C_I2", "P11_A3_I2", "P12C_I2",
  # P9-P13, section D (I1, I2)
  "P9D_I1", "P10D_I1", "P11D_I1", "P11_A4_I1",
  "P12D_I1", "P13D_O1_I1", "P13D_10_TEXT_I1",
  "P9D_I2", "P10D_I2", "P11D_I2", "P11_A4_I2",
  "P12D_I2", "P13D_O1_I2", "P13D_O2_I2",
  # P9-P14, section E (I1, I2, I3)
  "P9E_I1", "P10E_I1", "P11E_I1", "P11_A5_I1",
  "P12E_I1", "P13E_O1_I1", "P13E_O2_I1",
  "P9E_I2", "P10E_I2", "P11E_I2", "P11_A5_I2",
  "P12E_I2", "P13E_O1_I2", "P14E_O1_I2",
  "P9E_I3", "P10E_I3", "P11E_I3", "P11_A5_I3",
  "P12E_I3", "P13E_O1_I3", "P14E_O1_I3",
  # P18-P20 items
  "P20A",
  "P19B",
  "P18C_I1", "P19C_I1", "P20C_I1",
  "P18C_I2", "P19C_I2", "P20C_I2",
  "P18D_I1", "P19D_I1", "P20D_I1",
  "P18D_I2", "P19D_I2", "P20D_I2",
  "P18E_I1", "P19E_I1", "P20E_I1",
  # NEW_2 and the *_cl_* items, I1-I6
  "NEW_2",
  "NEW_2_cl_I1", "P19_cl_I1", "D_9_cl_I1", "D_4_cl_I1",
  "NEW_2_cl_I2", "P19_cl_I2", "D_9_cl_I2", "D_4_cl_I2",
  "NEW_2_cl_I3", "P19_cl_I3", "D_9_cl_I3", "D_4_cl_I3",
  "NEW_2_cl_I4", "P19_cl_I4", "D_9_cl_I4", "D_4_cl_I4",
  "NEW_2_cl_I5", "P19_cl_I5", "D_9_cl_I5", "D_4_cl_I5",
  # NOTE(review): I6 lists D_4 / D_8 where I1-I5 list D_9 / D_4 - confirm
  # this is intended and not a typo.
  "NEW_2_cl_I6", "P19_cl_I6", "D_4_cl_I6", "D_8_cl_I6",
  # E2_2, child_int and FM_* items
  "E2_2",
  "child_int",
  "forcedmarriage",
  "FM_self", "FM_spouse",
  "FM_child", "FM_childnum",
  "FM_parent", "FM_parentnum",
  "FM_sib", "FM_sibnum",
  "FM_self_aschild", "FM_spouse_aschild",
  "FM_child_aschild1", "FM_child_aschild2",
  "FM_parent_aschild1", "FM_parent_aschild2",
  "FM_sib_aschild1",
  # income / education variables and the *_rc* items
  "income", "incq", "noeduc",
  "rchild_int1", "rchild_int2", "rchild_int3",
  "rchild_int4", "rchild_int5", "rchild_int6",
  "age_rc1", "age_rc2", "age_rc3", "age_rc4", "age_rc5", "age_rc6",
  "menace_rc1", "menace_rc2",
  "noeduc_rc",
  "allchild_int_hh", "rchild_int_hh",
  "agegroup_rc1", "agegroup_rc2", "agegroup_rc3",
  "fiveyears_rcnum",
  "incqb_rc1", "incqb_rc2", "incqb_rc3",
  "incqb_rc4", "incqb_rc5", "incqb_rc6",
  "incqb_rcnum",
  "hhnoeduc_rc1", "hhnoeduc_rc2", "hhnoeduc_rc3",
  "hhnoeduc_rc4", "hhnoeduc_rc5",
  "hhnoeduc_rcnum"
)
# Pass the candidate indirect identifiers to the project helper
# `capture_tables`.
capture_tables(indirect_PII)
# Recode those with very specific values where more than half of the sample have actual data.
# H2_12_TEXT holds verbatim open-ended text in Nepali, so it is dropped.
mydata <- mydata[!names(mydata) %in% "H2_12_TEXT"]
# P3 - Number of siblings: intended to be top-coded at 10 or more siblings.
# TODO(review): no recode call for P3 is present in this section - add one
# or remove this note.
# E2_2 - Encode as low frequencies on languages.
# The result is assigned back to `mydata` (previously `mydata2`, which was
# overwritten before any use, so the encoding never reached the dataset).
mydata <- encode_direct_PII_team(variables = "E2_2")
## [1] "Frequency table before encoding"
## E2_2. What language did you use other than Nepali?
## NEWAR 999999
## 6 2005
## [1] "Frequency table after encoding"
## E2_2. What language did you use other than Nepali?
## 1 2
## 6 2005
# Topcode cases with 5 or more adult household members; 888 and 999999 are
# passed as missing codes.
mydata <- top_recode(
  "HC3",
  break_point = 5,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## HC3. How many people living in your household are at least 15 years old (have complet
## 0 1 2 3 4 5 6 7 8 9 13 888
## 751 542 470 168 52 16 4 1 1 1 2 3
## [1] "Frequency table after encoding"
## HC3. How many people living in your household are at least 15 years old (have complet
## 0 1 2 3 4 5 or more 888
## 751 542 470 168 52 25 3
# Top code high income to the 99.5 percentile
# The cut-off is the floored 99.5th percentile of Inc_17, excluding the
# 999999 missing code.
# NOTE(review): the frequency output shows values 777, 888 and 999 in Inc_17
# that are absent from `income`; they look like non-response codes and are
# currently included in the percentile - confirm whether to exclude them.
percentile_99.5 <- floor(
  quantile(
    mydata$Inc_17[mydata$Inc_17 != 999999],
    probs = 0.995
  )
)
mydata <- top_recode(
  variable = "Inc_17",
  break_point = percentile_99.5,
  missing = 999999
)
## [1] "Frequency table before encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS).
## 0 5 50 60 400 500 600 700 777 888 999 1000
## 132 1 1 1 1 5 2 1 4 1 22 3
## 1200 1500 1600 2000 2400 2500 2600 3000 3500 4000 4500 4800
## 1 4 5 26 2 8 1 33 2 42 3 1
## 5000 6000 6500 7000 8000 8500 9000 10000 11000 12000 12846 13000
## 99 34 1 38 20 1 24 193 3 54 1 11
## 14000 14500 15000 15500 16000 17000 18000 19000 19135 20000 21000 22000
## 10 1 174 1 23 8 15 6 1 220 2 12
## 22500 23000 24000 25000 26000 27000 27500 28000 30000 32000 33000 34000
## 1 4 5 105 1 5 1 2 177 2 1 1
## 35000 36000 37000 40000 41000 45000 48000 50000 54000 55000 57000 60000
## 54 4 1 88 3 17 1 113 2 4 2 48
## 62000 65000 66000 67000 68000 70000 75000 79000 79500 80000 85000 95000
## 1 7 1 1 1 15 4 1 1 13 1 1
## 1e+05 103000 104000 110000 115000 117000 125000 130000 135000 150000 160000 170000
## 27 1 1 1 2 1 1 1 1 11 1 1
## 2e+05 240000 250000 3e+05 320000 5e+05 6e+05 7e+05 1e+06
## 9 1 1 3 1 1 1 1 1
## [1] "Frequency table after encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS).
## 0 5 50 60 400 500
## 132 1 1 1 1 5
## 600 700 777 888 999 1000
## 2 1 4 1 22 3
## 1200 1500 1600 2000 2400 2500
## 1 4 5 26 2 8
## 2600 3000 3500 4000 4500 4800
## 1 33 2 42 3 1
## 5000 6000 6500 7000 8000 8500
## 99 34 1 38 20 1
## 9000 10000 11000 12000 12846 13000
## 24 193 3 54 1 11
## 14000 14500 15000 15500 16000 17000
## 10 1 174 1 23 8
## 18000 19000 19135 20000 21000 22000
## 15 6 1 220 2 12
## 22500 23000 24000 25000 26000 27000
## 1 4 5 105 1 5
## 27500 28000 30000 32000 33000 34000
## 1 2 177 2 1 1
## 35000 36000 37000 40000 41000 45000
## 54 4 1 88 3 17
## 48000 50000 54000 55000 57000 60000
## 1 113 2 4 2 48
## 62000 65000 66000 67000 68000 70000
## 1 7 1 1 1 15
## 75000 79000 79500 80000 85000 95000
## 4 1 1 13 1 1
## 1e+05 103000 104000 110000 115000 117000
## 27 1 1 1 2 1
## 125000 130000 135000 150000 160000 170000
## 1 1 1 11 1 1
## 2e+05 or more
## 19
# Top-code `income` with the cut-off already held in `percentile_99.5`
# (the alternative, computing it from `income` itself, is left commented out).
#percentile_99.5 <- floor(quantile(mydata$income[mydata$income!=999999], probs = c(0.995), na.rm=TRUE))
# Assign the result to `mydata`: writing it only to `mydata2` left the raw
# high incomes in `mydata$income`, even though the frequency output shows the
# same extreme values that were top-coded in Inc_17.
mydata <- top_recode(variable = "income", break_point = percentile_99.5, missing = 999999)
# Keep `mydata2` defined for any later code that still reads it.
mydata2 <- mydata
## [1] "Frequency table before encoding"
## income.
## 0 5 50 60 400 500 600 700 1000 1200 1500 1600
## 132 1 1 1 1 5 2 1 3 1 4 5
## 2000 2400 2500 2600 3000 3500 4000 4500 4800 5000 6000 6500
## 26 2 8 1 33 2 42 3 1 99 34 1
## 7000 8000 8500 9000 10000 11000 12000 12846 13000 14000 14500 15000
## 38 20 1 24 193 3 54 1 11 10 1 174
## 15500 16000 17000 18000 19000 19135 20000 21000 22000 22500 23000 24000
## 1 23 8 15 6 1 220 2 12 1 4 5
## 25000 26000 27000 27500 28000 30000 32000 33000 34000 35000 36000 37000
## 105 1 5 1 2 177 2 1 1 54 4 1
## 40000 41000 45000 48000 50000 54000 55000 57000 60000 62000 65000 66000
## 88 3 17 1 113 2 4 2 48 1 7 1
## 67000 68000 70000 75000 79000 79500 80000 85000 95000 1e+05 103000 104000
## 1 1 15 4 1 1 13 1 1 27 1 1
## 110000 115000 117000 125000 130000 135000 150000 160000 170000 2e+05 240000 250000
## 1 2 1 1 1 1 11 1 1 9 1 1
## 3e+05 320000 5e+05 6e+05 7e+05 1e+06 <NA>
## 3 1 1 1 1 1 27
## [1] "Frequency table after encoding"
## income. 2e+05
## 0 5 50 60 400 500
## 132 1 1 1 1 5
## 600 700 1000 1200 1500 1600
## 2 1 3 1 4 5
## 2000 2400 2500 2600 3000 3500
## 26 2 8 1 33 2
## 4000 4500 4800 5000 6000 6500
## 42 3 1 99 34 1
## 7000 8000 8500 9000 10000 11000
## 38 20 1 24 193 3
## 12000 12846 13000 14000 14500 15000
## 54 1 11 10 1 174
## 15500 16000 17000 18000 19000 19135
## 1 23 8 15 6 1
## 20000 21000 22000 22500 23000 24000
## 220 2 12 1 4 5
## 25000 26000 27000 27500 28000 30000
## 105 1 5 1 2 177
## 32000 33000 34000 35000 36000 37000
## 2 1 1 54 4 1
## 40000 41000 45000 48000 50000 54000
## 88 3 17 1 113 2
## 55000 57000 60000 62000 65000 66000
## 4 2 48 1 7 1
## 67000 68000 70000 75000 79000 79500
## 1 1 15 4 1 1
## 80000 85000 95000 1e+05 103000 104000
## 13 1 1 27 1 1
## 110000 115000 117000 125000 130000 135000
## 1 2 1 1 1 1
## 150000 160000 170000 2e+05 or more <NA>
## 11 1 1 19 27
# Build the initial sdcMicro object from the categorical key variables chosen
# after inspecting the dictionary.
# Reference: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# Every name must match a column in the data file.
# Intended key variables: gender, occupation/education and age.
# NOTE(review): `con1A_gender_I1` sits among the con1* items rather than the
# respondent demographics - confirm it is the respondent's gender.
selectedKeyVars <- c(
  "D_4",
  "IDR3_20",
  "con1A_gender_I1"
) ##!!! Replace with candidate categorical demo vars
# Create the sdcMicro object, then print its summary.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)
sdcInitial
## The input dataset consists of 2011 rows and 1400 variables.
## --> Categorical key variables: D_4, IDR3_20, con1A_gender_I1
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## D_4 17 (17) 118.294 (118.294) 1
## IDR3_20 6 (6) 335.167 (335.167) 37
## con1A_gender_I1 2 (2) 1005.500 (1005.500) 997
##
## (1)
## (37)
## (997)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 25 (1.243%)
## - 3-anonymity: 57 (2.834%)
## - 5-anonymity: 138 (6.862%)
##
## ----------------------------------------------------------------------
# Recode of education and age to reduce risk of re-identification
# `break_edu` gives the lower bounds of half-open bins [lower, upper) over
# the raw HC4_* codes; `labels_edu` names the bins in the same order.
# Per the frequency tables, the raw special codes are 777 = Refused to
# answer, 888 = Does not apply, 999 = Don't know, 999999 = missing.
# The labels were previously shifted by one from 777 upwards: 777 was
# labelled "Does not apply", 888 "Don't Know", and 999 was merged with the
# missing code under "NA". The extra break at 1000 keeps 999 in its own bin.
# NOTE(review): this relies on ordinal_recode() adding a final bin that runs
# up to just above `missing` (the "[999,1e+06)" column in its output) -
# confirm in functions_1.7.R.
break_edu <- c(0, 6, 9, 11, 12, 13, 15, 16, 777, 888, 999, 1000)
labels_edu <- c("Primary or less (0-5)" = 1,
                "Lower secondary (6-8)" = 2,
                "Secondary (9-10)" = 3,
                "SLC (11)" = 4,
                "CLASS 12/Intermediate level (12)" = 5,
                "Bachelor/Postgraduate level" = 6,
                "Literate, but never attended school" = 7,
                "Illiterate, and never attended school" = 8,
                "Refused to answer" = 9,
                "Does not apply" = 10,
                "Don't Know" = 11,
                "NA" = 12)
# Band spouse's education (HC4_1) using the shared `break_edu` /
# `labels_edu` scheme.
mydata <- ordinal_recode(
  variable = "HC4_1",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## HC4_1. What is the highest completed education level of your spouse? [You do not need
## CLASS 1 CLASS 2
## 15 48
## CLASS 3 CLASS 4
## 48 68
## CLASS 5 CLASS 6
## 114 57
## CLASS 7 CLASS 8
## 77 114
## CLASS 9 CLASS 10
## 69 86
## SLC CLASS 12/Intermediate level
## 188 126
## Bachelor level Post-Secondary Level (e.g., MA, PhD)
## 34 11
## Literate, but never attended school Illiterate, and never attended school
## 233 277
## Does not apply Don't know
## 4 3
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888)
## 1 15 0 0 0 0 0 0 0 0
## 2 48 0 0 0 0 0 0 0 0
## 3 48 0 0 0 0 0 0 0 0
## 4 68 0 0 0 0 0 0 0 0
## 5 114 0 0 0 0 0 0 0 0
## 6 0 57 0 0 0 0 0 0 0
## 7 0 77 0 0 0 0 0 0 0
## 8 0 114 0 0 0 0 0 0 0
## 9 0 0 69 0 0 0 0 0 0
## 10 0 0 86 0 0 0 0 0 0
## 11 0 0 0 188 0 0 0 0 0
## 12 0 0 0 0 126 0 0 0 0
## 13 0 0 0 0 0 34 0 0 0
## 14 0 0 0 0 0 11 0 0 0
## 15 0 0 0 0 0 0 233 0 0
## 16 0 0 0 0 0 0 0 277 0
## 888 0 0 0 0 0 0 0 0 0
## 999 0 0 0 0 0 0 0 0 0
## 999999 0 0 0 0 0 0 0 0 0
## recoded
## [888,999) [999,1e+06)
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## 7 0 0
## 8 0 0
## 9 0 0
## 10 0 0
## 11 0 0
## 12 0 0
## 13 0 0
## 14 0 0
## 15 0 0
## 16 0 0
## 888 4 0
## 999 0 3
## 999999 0 439
## [1] "Frequency table after encoding"
## HC4_1. What is the highest completed education level of your spouse? [You do not need
## Primary or less (0-5) Lower secondary (6-8)
## 293 248
## Secondary (9-10) SLC (11)
## 155 188
## CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 126 45
## Literate, but never attended school Illiterate, and never attended school
## 233 277
## Don't Know NA
## 4 442
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8)
## 1 2
## Secondary (9-10) SLC (11)
## 3 4
## CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 5 6
## Literate, but never attended school Illiterate, and never attended school
## 7 8
## Does not apply Don't Know
## 9 10
## NA
## 11
# Band father's education (HC4_2) using the shared `break_edu` /
# `labels_edu` scheme.
mydata <- ordinal_recode(
  variable = "HC4_2",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## HC4_2. What is the highest completed education level of your father? [You do not need
## Pre-school/Kindergarten CLASS 1
## 1 12
## CLASS 2 CLASS 3
## 27 31
## CLASS 4 CLASS 5
## 23 61
## CLASS 6 CLASS 7
## 15 21
## CLASS 8 CLASS 9
## 44 22
## CLASS 10 SLC
## 26 47
## CLASS 12/Intermediate level Bachelor level
## 29 4
## Post-Secondary Level (e.g., MA, PhD) Literate, but never attended school
## 2 121
## Illiterate, and never attended school Refused to answer
## 111 1
## Does not apply Don't know
## 51 11
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888)
## 0 1 0 0 0 0 0 0 0 0
## 1 12 0 0 0 0 0 0 0 0
## 2 27 0 0 0 0 0 0 0 0
## 3 31 0 0 0 0 0 0 0 0
## 4 23 0 0 0 0 0 0 0 0
## 5 61 0 0 0 0 0 0 0 0
## 6 0 15 0 0 0 0 0 0 0
## 7 0 21 0 0 0 0 0 0 0
## 8 0 44 0 0 0 0 0 0 0
## 9 0 0 22 0 0 0 0 0 0
## 10 0 0 26 0 0 0 0 0 0
## 11 0 0 0 47 0 0 0 0 0
## 12 0 0 0 0 29 0 0 0 0
## 13 0 0 0 0 0 4 0 0 0
## 14 0 0 0 0 0 2 0 0 0
## 15 0 0 0 0 0 0 121 0 0
## 16 0 0 0 0 0 0 0 111 0
## 777 0 0 0 0 0 0 0 0 1
## 888 0 0 0 0 0 0 0 0 0
## 999 0 0 0 0 0 0 0 0 0
## 999999 0 0 0 0 0 0 0 0 0
## recoded
## [888,999) [999,1e+06)
## 0 0 0
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## 7 0 0
## 8 0 0
## 9 0 0
## 10 0 0
## 11 0 0
## 12 0 0
## 13 0 0
## 14 0 0
## 15 0 0
## 16 0 0
## 777 0 0
## 888 51 0
## 999 0 11
## 999999 0 1351
## [1] "Frequency table after encoding"
## HC4_2. What is the highest completed education level of your father? [You do not need
## Primary or less (0-5) Lower secondary (6-8)
## 155 80
## Secondary (9-10) SLC (11)
## 48 47
## CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 29 6
## Literate, but never attended school Illiterate, and never attended school
## 121 111
## Does not apply Don't Know
## 1 51
## NA
## 1362
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8)
## 1 2
## Secondary (9-10) SLC (11)
## 3 4
## CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 5 6
## Literate, but never attended school Illiterate, and never attended school
## 7 8
## Does not apply Don't Know
## 9 10
## NA
## 11
# Band mother's education (HC4_3) using the shared `break_edu` /
# `labels_edu` scheme.
mydata <- ordinal_recode(
  variable = "HC4_3",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## HC4_3. What is the highest completed education level of your mother? If you have more t
## CLASS 1 CLASS 2
## 3 13
## CLASS 3 CLASS 4
## 10 21
## CLASS 5 CLASS 6
## 24 13
## CLASS 7 CLASS 8
## 6 20
## CLASS 9 CLASS 10
## 5 9
## SLC CLASS 12/Intermediate level
## 23 5
## Bachelor level Literate, but never attended school
## 1 225
## Illiterate, and never attended school Does not apply
## 259 20
## Don't know
## 3
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888)
## 1 3 0 0 0 0 0 0 0 0
## 2 13 0 0 0 0 0 0 0 0
## 3 10 0 0 0 0 0 0 0 0
## 4 21 0 0 0 0 0 0 0 0
## 5 24 0 0 0 0 0 0 0 0
## 6 0 13 0 0 0 0 0 0 0
## 7 0 6 0 0 0 0 0 0 0
## 8 0 20 0 0 0 0 0 0 0
## 9 0 0 5 0 0 0 0 0 0
## 10 0 0 9 0 0 0 0 0 0
## 11 0 0 0 23 0 0 0 0 0
## 12 0 0 0 0 5 0 0 0 0
## 13 0 0 0 0 0 1 0 0 0
## 15 0 0 0 0 0 0 225 0 0
## 16 0 0 0 0 0 0 0 259 0
## 888 0 0 0 0 0 0 0 0 0
## 999 0 0 0 0 0 0 0 0 0
## 999999 0 0 0 0 0 0 0 0 0
## recoded
## [888,999) [999,1e+06)
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## 7 0 0
## 8 0 0
## 9 0 0
## 10 0 0
## 11 0 0
## 12 0 0
## 13 0 0
## 15 0 0
## 16 0 0
## 888 20 0
## 999 0 3
## 999999 0 1351
## [1] "Frequency table after encoding"
## HC4_3. What is the highest completed education level of your mother? If you have more t
## Primary or less (0-5) Lower secondary (6-8)
## 71 39
## Secondary (9-10) SLC (11)
## 14 23
## CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 5 1
## Literate, but never attended school Illiterate, and never attended school
## 225 259
## Don't Know NA
## 20 1354
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8)
## 1 2
## Secondary (9-10) SLC (11)
## 3 4
## CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 5 6
## Literate, but never attended school Illiterate, and never attended school
## 7 8
## Does not apply Don't Know
## 9 10
## NA
## 11
# Recode the most-educated grandparent's education level (HC4_4) with the
# shared education break points and value labels.
# NOTE(review): in the tables printed for this call, code 999 ("Don't know",
# n = 3) is merged into "NA" (1912 + 3 = 1915) rather than kept as
# "Don't Know". Confirm that break_edu and labels_edu line up with the
# special codes.
mydata <- ordinal_recode(
  variable = "HC4_4",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## HC4_4. Think about your grandparents, and the grandparent with the most education. What
## CLASS 2 CLASS 3
## 2 2
## CLASS 4 CLASS 5
## 1 2
## CLASS 9 SLC
## 1 1
## CLASS 12/Intermediate level Literate, but never attended school
## 2 29
## Illiterate, and never attended school Don't know
## 56 3
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888)
## 2 2 0 0 0 0 0 0 0 0
## 3 2 0 0 0 0 0 0 0 0
## 4 1 0 0 0 0 0 0 0 0
## 5 2 0 0 0 0 0 0 0 0
## 9 0 0 1 0 0 0 0 0 0
## 11 0 0 0 1 0 0 0 0 0
## 12 0 0 0 0 2 0 0 0 0
## 15 0 0 0 0 0 0 29 0 0
## 16 0 0 0 0 0 0 0 56 0
## 999 0 0 0 0 0 0 0 0 0
## 999999 0 0 0 0 0 0 0 0 0
## recoded
## [888,999) [999,1e+06)
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 9 0 0
## 11 0 0
## 12 0 0
## 15 0 0
## 16 0 0
## 999 0 3
## 999999 0 1912
## [1] "Frequency table after encoding"
## HC4_4. Think about your grandparents, and the grandparent with the most education. What
## Primary or less (0-5) Secondary (9-10)
## 7 1
## SLC (11) CLASS 12/Intermediate level (12)
## 1 2
## Literate, but never attended school Illiterate, and never attended school
## 29 56
## NA
## 1915
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8)
## 1 2
## Secondary (9-10) SLC (11)
## 3 4
## CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 5 6
## Literate, but never attended school Illiterate, and never attended school
## 7 8
## Does not apply Don't Know
## 9 10
## NA
## 11
# Recode the respondent's own highest completed education level (D_4) with
# the shared education break points and value labels. D_4 is one of the
# categorical key variables checked for k-anonymity further down.
mydata <- ordinal_recode(
  variable = "D_4",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## D_4. What is your highest completed education level? [You do not need to read the re
## Pre-school/Kindergarten CLASS 1
## 1 31
## CLASS 2 CLASS 3
## 54 71
## CLASS 4 CLASS 5
## 73 149
## CLASS 6 CLASS 7
## 69 85
## CLASS 8 CLASS 9
## 120 84
## CLASS 10 SLC
## 102 296
## CLASS 12/Intermediate level Bachelor level
## 264 62
## Post-Secondary Level (e.g., MA, PhD) Literate, but never attended school
## 9 304
## Illiterate, and never attended school
## 237
## recoded
## [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888) [888,999)
## 0 1 0 0 0 0 0 0 0 0 0
## 1 31 0 0 0 0 0 0 0 0 0
## 2 54 0 0 0 0 0 0 0 0 0
## 3 71 0 0 0 0 0 0 0 0 0
## 4 73 0 0 0 0 0 0 0 0 0
## 5 149 0 0 0 0 0 0 0 0 0
## 6 0 69 0 0 0 0 0 0 0 0
## 7 0 85 0 0 0 0 0 0 0 0
## 8 0 120 0 0 0 0 0 0 0 0
## 9 0 0 84 0 0 0 0 0 0 0
## 10 0 0 102 0 0 0 0 0 0 0
## 11 0 0 0 296 0 0 0 0 0 0
## 12 0 0 0 0 264 0 0 0 0 0
## 13 0 0 0 0 0 62 0 0 0 0
## 14 0 0 0 0 0 9 0 0 0 0
## 15 0 0 0 0 0 0 304 0 0 0
## 16 0 0 0 0 0 0 0 237 0 0
## recoded
## [999,1e+06)
## 0 0
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 0
## 9 0
## 10 0
## 11 0
## 12 0
## 13 0
## 14 0
## 15 0
## 16 0
## [1] "Frequency table after encoding"
## D_4. What is your highest completed education level? [You do not need to read the re
## Primary or less (0-5) Lower secondary (6-8)
## 379 274
## Secondary (9-10) SLC (11)
## 186 296
## CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 264 71
## Literate, but never attended school Illiterate, and never attended school
## 304 237
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8)
## 1 2
## Secondary (9-10) SLC (11)
## 3 4
## CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 5 6
## Literate, but never attended school Illiterate, and never attended school
## 7 8
## Does not apply Don't Know
## 9 10
## NA
## 11
# Rebuild the sdcMicro object on the recoded data and print its summary to
# re-check 2-anonymity on the selected key variables.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)
sdcInitial
## The input dataset consists of 2011 rows and 1400 variables.
## --> Categorical key variables: D_4, IDR3_20, con1A_gender_I1
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## D_4 8 (8) 251.375 (251.375) 71
## IDR3_20 6 (6) 335.167 (335.167) 37
## con1A_gender_I1 2 (2) 1005.500 (1005.500) 997
##
## (71)
## (37)
## (997)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 6 (0.298%)
## - 3-anonymity: 14 (0.696%)
## - 5-anonymity: 51 (2.536%)
##
## ----------------------------------------------------------------------
# Show the values of the key variables for the records that violate k-anonymity
#mydata <- labelDataset(mydata)
# Flag the records whose value in column 2 of the individual-risk table is
# below 2, i.e. the records violating 2-anonymity. Presumably column 2 is the
# sample frequency (`fk`) of the record's key combination -- confirm against
# the sdcMicro documentation.
notAnon <- sdcInitial@risk$individual[, 2] < 2
# List the key-variable values of the flagged records.
mydata[notAnon, selectedKeyVars]
## Registered S3 method overwritten by 'cli':
## method from
## print.boxx spatstat.geom
## # A tibble: 6 x 3
## D_4 IDR3_20 con1A_gender_I1
## <dbl+lbl> <dbl+lbl> <dbl+lbl>
## 1 4 [SLC (11)] 6 [65 and older] 2 [female]
## 2 6 [Bachelor/Postgraduate level] 4 [45-54] 1 [male]
## 3 6 [Bachelor/Postgraduate level] 5 [55-64] 2 [female]
## 4 7 [Literate, but never attended school] 1 [15-24] 2 [female]
## 5 7 [Literate, but never attended school] 1 [15-24] 1 [male]
## 6 3 [Secondary (9-10)] 6 [65 and older] 2 [female]
# Apply local suppression to reach 2-anonymity (k = 2 is localSuppression()'s
# default, written out here for clarity).
sdcFinal <- localSuppression(sdcInitial, k = 2)
# Show the key variables of the previously flagged records after suppression
# (manipulated variables).
extractManipData(sdcFinal)[notAnon, selectedKeyVars]
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## D_4 IDR3_20 con1A_gender_I1
## 176 NA 6 2
## 307 NA 4 1
## 492 NA 5 2
## 1154 NA 1 2
## 1435 NA 1 1
## 1529 NA 6 2
# Carry the suppression over to the working data. In the printed result of
# localSuppression() only D_4 was blanked for the flagged records, so only
# D_4 is overwritten, using 11, the code labelled "NA" in the education
# value labels.
mydata[notAnon, "D_4"] <- 11
# !!! Identify open-end variables here.
# Names of the free-text (translated) variables to review for identifying
# information.
open_ends <- c(
  "H2_12_TEXT_Translation",
  "HTNx3_2_14_TEXT_Translation",
  "HTN_5x3_TEXT_Translation",
  "HTV_1_10_TEXTx3_Translation",
  "HTV_3_11_TEXTx3_Translation",
  "CPR5i_TEXT_Translation",
  "G1_00_08_TEXT_Translation",
  "P13A_10_TEXT_Translation",
  "P14A_12_TEXT_Translation",
  "SIMPOC7A_10_TEXT_Translation",
  "P13B_10_TEXT_Translation",
  "P14B_12_TEXT_Translation",
  "SIMPOC7B_10_TEXT_Translation",
  "P13C_10_TEXT_I1_Translation",
  "P14C_12_TEXT_I1_Translation",
  "SIMPOC7C_10_TEXT_I1_Translation",
  "P14C_12_TEXT_I2_Translation",
  "P13D_10_TEXT_I1_Translation",
  "P14D_12_TEXT_I1_Translation",
  "P14D_12_TEXT_I2_Translation",
  "P13E_10_TEXT_I1_Translation",
  "P14E_12_TEXT_I1_Translation",
  "SIMPOC7E_10_TEXT_I1_Translation",
  "P14E_12_TEXT_I2_Translation",
  "P14E_12_TEXT_I3_Translation",
  "NEW_3_12_TEXT_Translation",
  "NEW_9_TEXT_Translation",
  "SIMPOC7_cl_10_TEXT_I1_Translate",
  "SIMPOC7_cl_10_TEXT_I2_Translate",
  "NEW_10_TEXT_Translation",
  "P13_cl_O3_TEXT_I1_Translation",
  "NEW_9_cl_TEXT_I1_Translation",
  "NEW_9_cl_TEXT_I2_Translation",
  "NEW_9_cl_TEXT_I3_Translation",
  "P14_cl_O2_I1_TEXT_Translation",
  "P13_cl_O2_TEXT_I2_Translation",
  "SIMPOC7_cl_10_TEXT_I3_Translate",
  "P14_cl_O1_I3_TEXT_Translation",
  "P14_cl_O1_I2_TEXT_Translation",
  "e3e_TEXT_Translation",
  "E2_11_8_TEXT_Translation",
  "E_14_7_TEXT_Translation"
)
# Hand the list to the project's open-end reporting helper.
report_open(list_open_ends = open_ends)
# Review "verbatims.csv". Identify the variables to be deleted or redacted
# and their row numbers, then overwrite the affected cells below.
# NOTE(review): the row numbers are hard-coded, so they are only valid for
# the current row order of mydata -- re-check them if the input file changes.
# NOTE(review): the replacement texts contain what look like typos ("bother",
# "ocuppation") and are written to the released data as-is; confirm the
# intended wording.
mydata$E_14_7_TEXT_Translation[1313] <-
  "Respondent's bother was tricked in bad activities and later threatened to help [activity redacted]"
mydata$E_14_7_TEXT_Translation[1694] <-
  "In Q64, respondent said there was no income and later in Q307 respondent said [amount redacted] so entered the option more than 12,000 in Q307"
mydata$E_14_7_TEXT_Translation[1907] <-
  "GPS did not capture for about 20 minutes and started the interview without GPS. In Q64 respondent did not have any income but her/his son sent [amount redacted] the other day"
mydata$NEW_9_TEXT_Translation[1895] <-
  "Make [ocuppation redacted]"
mydata$NEW_10_TEXT_Translation[1554] <-
  "Shop [type redacted]"
#mydata <- mydata[!names(mydata) %in% "SrvyrComment"]
# Set up the map data used when displacing GPS coordinates.
# !!! Select correct country
countrymap <- map_data("world") %>%
  dplyr::filter(region == "Nepal")
# Alternative (disabled): download the boundary instead of reading a local copy.
#admin <- raster::getData("GADM", country="NP", level=0) #!!! Select correct country map using standard 2-letter country codes: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
# Read the locally stored GADM level-0 boundary file for Nepal.
admin <- readRDS(file = "gadm36_NPL_0_sp.rds")
# Displace every pair of GPS variables, starting with (Longitude, Latitude).
# Check the printed summary statistics and the maps before and after
# displacement.
# !!! Include relevant variables, always longitude first, latitude second.
gps.vars <- c("Longitude", "Latitude")
# May take a few minutes to process.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
)
## Warning: Removed 41 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## Longitude Latitude
## Min. :84.31 Min. :26.85
## 1st Qu.:85.02 1st Qu.:27.55
## Median :85.46 Median :27.64
## Mean :85.34 Mean :27.59
## 3rd Qu.:85.61 3rd Qu.:27.72
## Max. :86.15 Max. :28.00
## NA's :41 NA's :41
## Warning: Removed 41 rows containing missing values (geom_point).
## Warning: Removed 41 rows containing missing values (geom_point).
## Warning: Removed 41 rows containing missing values (geom_point).
## Warning: Removed 41 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## Longitude Latitude
## Min. :84.27 Min. :26.83
## 1st Qu.:85.01 1st Qu.:27.54
## Median :85.46 Median :27.64
## Mean :85.34 Mean :27.59
## 3rd Qu.:85.61 3rd Qu.:27.72
## Max. :86.17 Max. :28.04
## NA's :41 NA's :41
## [1] "Processing time = 5.18069293498993"
# Displace the (GPSinitial_LO, GPSinitial_LA) coordinate pair.
# !!! Include relevant variables, always longitude first, latitude second.
gps.vars <- c("GPSinitial_LO", "GPSinitial_LA")
# May take a few minutes to process.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
)
## Warning: Removed 40 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## GPSinitial_LO GPSinitial_LA
## Min. :84.31 Min. :26.85
## 1st Qu.:85.03 1st Qu.:27.55
## Median :85.45 Median :27.65
## Mean :85.34 Mean :27.59
## 3rd Qu.:85.61 3rd Qu.:27.72
## Max. :86.15 Max. :28.00
## NA's :40 NA's :40
## Warning: Removed 40 rows containing missing values (geom_point).
## Warning: Removed 40 rows containing missing values (geom_point).
## Warning: Removed 40 rows containing missing values (geom_point).
## Warning: Removed 40 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## GPSinitial_LO GPSinitial_LA
## Min. :84.28 Min. :26.84
## 1st Qu.:85.02 1st Qu.:27.54
## Median :85.45 Median :27.64
## Mean :85.34 Mean :27.59
## 3rd Qu.:85.60 3rd Qu.:27.73
## Max. :86.19 Max. :28.04
## NA's :40 NA's :40
## [1] "Processing time = 5.33818366527557"
# Displace the (gps_CEa_LO, gps_CEa_LA) coordinate pair.
# !!! Include relevant variables, always longitude first, latitude second.
gps.vars <- c("gps_CEa_LO", "gps_CEa_LA")
# May take a few minutes to process.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
)
## Warning: Removed 281 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## gps_CEa_LO gps_CEa_LA
## Min. :84.31 Min. :26.85
## 1st Qu.:84.96 1st Qu.:27.56
## Median :85.45 Median :27.64
## Mean :85.33 Mean :27.59
## 3rd Qu.:85.60 3rd Qu.:27.72
## Max. :86.15 Max. :28.00
## NA's :281 NA's :281
## Warning: Removed 281 rows containing missing values (geom_point).
## Warning: Removed 281 rows containing missing values (geom_point).
## Warning: Removed 281 rows containing missing values (geom_point).
## Warning: Removed 281 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## gps_CEa_LO gps_CEa_LA
## Min. :84.26 Min. :26.82
## 1st Qu.:84.98 1st Qu.:27.54
## Median :85.44 Median :27.63
## Mean :85.33 Mean :27.59
## 3rd Qu.:85.60 3rd Qu.:27.72
## Max. :86.17 Max. :28.03
## NA's :281 NA's :281
## [1] "Processing time = 4.2291198849678"
# Displace the (gpsenumimp_LO, gpsenumimp_LA) coordinate pair.
# !!! Include relevant variables, always longitude first, latitude second.
gps.vars <- c("gpsenumimp_LO", "gpsenumimp_LA")
# May take a few minutes to process.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
)
## Warning: Removed 274 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## gpsenumimp_LO gpsenumimp_LA
## Min. :84.31 Min. :26.85
## 1st Qu.:84.96 1st Qu.:27.56
## Median :85.45 Median :27.64
## Mean :85.33 Mean :27.59
## 3rd Qu.:85.60 3rd Qu.:27.72
## Max. :86.15 Max. :28.00
## NA's :274 NA's :274
## Warning: Removed 274 rows containing missing values (geom_point).
## Warning: Removed 274 rows containing missing values (geom_point).
## Warning: Removed 274 rows containing missing values (geom_point).
## Warning: Removed 274 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## gpsenumimp_LO gpsenumimp_LA
## Min. :84.28 Min. :26.84
## 1st Qu.:84.98 1st Qu.:27.55
## Median :85.44 Median :27.64
## Mean :85.32 Mean :27.59
## 3rd Qu.:85.60 3rd Qu.:27.72
## Max. :86.19 Max. :28.04
## NA's :274 NA's :274
## [1] "Processing time = 3.88036923011144"
# Save the anonymized dataset, appending "_PU" (Public Use) to the end of the file name
# Write Stata (.dta) and SPSS (.sav) copies of the processed data.
haven::write_dta(
  data = mydata,
  path = paste0(filename, "_PU.dta")
)
haven::write_sav(
  data = mydata,
  path = paste0(filename, "_PU.sav")
)