rm(list=ls(all=t))
filename <- "UC Berkeley_Nepal_Awareness-Raising for Police_Public Data_Final" # !!!Update filename
source ("functions_1.7.R")
## --------
## This is sdcMicro v5.6.0.
## For references, please have a look at citation('sdcMicro')
## Note: since version 5.0.0, the graphical user-interface is a shiny-app that can be started with sdcApp().
## Please submit suggestions and bugs at: https://github.com/sdcTools/sdcMicro/issues
## --------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: sp
## Checking rgeos availability: TRUE
##
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:sdcMicro':
##
## freq
## rgdal: version: 1.5-23, (SVN revision 1121)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.2.1, released 2020/12/29
## Path to GDAL shared files: C:/Users/C_Pablo_Diego-Rosell/Documents/R/R-3.6.3/library/rgdal/gdal
## GDAL binary built with GEOS: TRUE
## Loaded PROJ runtime: Rel. 7.2.1, January 1st, 2021, [PJ_VERSION: 721]
## Path to PROJ shared files: C:/Users/C_Pablo_Diego-Rosell/Documents/R/R-3.6.3/library/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.4-5
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
## Overwritten PROJ_LIB was C:/Users/C_Pablo_Diego-Rosell/Documents/R/R-3.6.3/library/rgdal/proj
## Loading required package: spatstat.data
## Loading required package: spatstat.geom
## spatstat.geom 2.0-1
##
## Attaching package: 'spatstat.geom'
## The following objects are masked from 'package:raster':
##
## area, rotate, shift
## Loading required package: spatstat.core
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:raster':
##
## getData
## The following object is masked from 'package:dplyr':
##
## collapse
## Loading required package: rpart
## spatstat.core 2.0-0
## Loading required package: spatstat.linnet
## spatstat.linnet 2.1-1
##
## spatstat 2.0-1 (nickname: 'Caution: contains small parts')
## For an introduction to spatstat, type 'beginner'
## rgeos version: 0.5-5, (SVN revision 640)
## GEOS runtime version: 3.8.0-CAPI-1.13.1
## Linking to sp version: 1.4-4
## Polygon checking: TRUE
##
## Spatial Point Pattern Analysis Code in S-Plus
##
## Version 2 - Spatial and Space-Time analysis
##
## Attaching package: 'splancs'
## The following object is masked from 'package:raster':
##
## zoom
## The following object is masked from 'package:dplyr':
##
## tribble
## Loading required package: spam
## Loading required package: dotCall64
## Loading required package: grid
## Spam version 2.6-0 (2020-12-14) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
## See https://github.com/NCAR/Fields for
## an extensive vignette, other supplements and source code
##
## Attaching package: 'geosphere'
## The following object is masked from 'package:spatstat.geom':
##
## perimeter
##
## Attaching package: 'tibble'
## The following object is masked from 'package:splancs':
##
## tribble
Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
# !!!No Direct PII variables
!!!Replace vector in "variables" field below with relevant variable names
mydata <- encode_direct_PII_team (variables=c("Srvyr"))
## [1] "Frequency table before encoding"
## Srvyr. Srvyr
## HTVBadri HTVdeepak HTVKaji HTVPranita HTVTara
## 237 188 184 201 255
## [1] "Frequency table after encoding"
## Srvyr. Srvyr
## 1 2 3 4 5
## 237 188 184 201 255
!!!Include relevant variables, but check their population size first to confirm they are <100,000
locvars <- c("vdc_id", "police_id")
mydata <- encode_location (variables= locvars, missing=999999)
## [1] "Frequency table before encoding"
## vdc_id. VDC or Municipality
## Bhaktapur municipality Changunarayan municipality Madhyapur municipality
## 123 39 70
## Suryabinayak municipality Bharatpur municipality Ichyakamana VDC
## 109 128 9
## Kalika municipality Khairahani municipality Madi municipality
## 3 17 13
## Rapti municipality Ratnanagar municipality Benighat Rorang VDC
## 14 28 9
## Dhunibesi municipality Gajuri VDC Galchhi VDC
## 15 18 9
## Ganga Jamuna VDC Jwalamukhi VDC Khaniyabas VDC
## 5 4 6
## Netrawati VDC Nilkantha municipality Siddhalek VDC
## 6 58 3
## Thakre VDC Tripurasundari VDC Banepa municipality
## 3 6 22
## Bethan VDC Bhumlu VDC Chauri deurali VDC
## 2 7 4
## Dhulikhel municipality Khanikhola VDC Mahabharat VDC
## 61 3 7
## Mandan deupur municipality Namobuddha municipality Panauti municipality
## 4 8 8
## Panchkhal Municipality Roshi VDC Timal VDC
## 14 8 2
## Bagmati VDC Bakaiya VDC Bhimfedi VDC
## 16 8 14
## Hetauda municipality Indrasarobar VDC Kailash VDC
## 133 8 6
## Manahari VDC Thaha municipality
## 18 17
## [1] "Frequency table after encoding"
## vdc_id. VDC or Municipality
## 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404
## 17 8 4 9 13 14 7 5 14 6 4 8 16 3 58 123 22 8 9 15 3
## 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425
## 6 2 18 2 7 3 6 8 39 3 18 28 8 109 17 9 70 6 61 133 128
## 426 427
## 14 4
## [1] "Frequency table before encoding"
## police_id. Police Station
## Byasi police station
## 14
## District police office, Bhaktapur
## 91
## Dubarsquare police station
## 9
## Dudhpati police station
## 9
## Changunarayan police station
## 6
## Duwakot police station
## 7
## Kharipati police station
## 10
## Nagarkot police station
## 8
## Sudal police station
## 8
## Sanothimi police station
## 14
## Thimi police range
## 56
## Balkot police station
## 14
## Dadhikot police station
## 6
## Jagati police range
## 70
## Katunje police station
## 16
## Sirutar police station
## 5
## Chanauli police station
## 4
## Dibyanagar police station
## 4
## Distict Police Office, Chitawan
## 75
## Gitanagar police station
## 8
## Gondrang police station
## 2
## Jugedi police station
## 4
## Narayangadh police station
## 19
## Paras Buspark police station
## 4
## Ramnagar police station
## 4
## Sitaram Chowk police station
## 4
## Mugling police station
## 9
## Jutpani police station
## 3
## Khairahani police station
## 13
## Samjhaut Pipariya police station
## 2
## Baruwa Madi police station
## 2
## Kalyanpur police station
## 2
## Madi police station
## 9
## Bhandara police station
## 12
## Lothar police station
## 2
## Bachhauli police station
## 3
## Mainaha police station
## 2
## Panchakanya police station
## 3
## Tandi police station
## 20
## Benighat police station
## 3
## Chundunga police station
## 2
## Jogimara police station
## 4
## Jiwanpur police station
## 2
## Khanikhola police station
## 13
## Gajuri police station
## 16
## Malekhu police station
## 2
## Baireni police station
## 6
## Galchi police station
## 3
## Budhathum police station
## 2
## Phulkharka police station
## 3
## Maidi police station
## 4
## Dharkha police station
## 6
## Marpak police station
## 2
## Semjom police station
## 2
## Todke police station
## 2
## Dhuwakot police station
## 2
## District Police Office, Dhading
## 49
## Pucharbajar police station
## 2
## Sunaulabajar police station
## 5
## Nalang police station
## 3
## Mahadevbesi police station
## 3
## Salyankot police station
## 3
## Salyantar police station
## 3
## Banepa police station
## 17
## Nala police station
## 2
## Sanga police station
## 3
## Chyamrangbesi police station (Dhungakharka)
## 2
## Chaubas police station
## 2
## Dolalghat police station
## 3
## Phalate police station
## 2
## Kattikedeurali police station
## 4
## District Police Office, Kavre
## 58
## Kavrebhanjyang police station
## 3
## Taaldhunga police station
## 3
## Gokule police station
## 4
## Phoksingtar police station
## 3
## Deupur police station
## 2
## Higuwapati police station
## 2
## Machchepauwa police station (Bhakunde)
## 4
## Shikharpur police station (Kanpur)
## 2
## Shyampati police station
## 2
## Khopasi police station
## 2
## Kushadevi police station
## 2
## Panauti police station
## 4
## Baluwa police station
## 2
## Bhagawatisthan police station
## 4
## Khawa police station
## 2
## Panchkhal police station
## 4
## Timalbesi police station
## 2
## Mangaltar police station
## 6
## Pachuwarghat police station
## 2
## Bhorleni police station
## 2
## Jhurjhure police station
## 3
## Phaparbari police station
## 7
## Shiwaraigaun police station
## 4
## Hattisude police station
## 3
## Thingan police station
## 5
## Bhaise police station
## 3
## Bhimfedi police station
## 11
## Basamadi police station
## 3
## Bhutandevi police station
## 4
## Buspark police station hetauda
## 3
## Chaughada police station
## 4
## Chaukitole police station
## 4
## Hetauda police station
## 25
## Newarpani police station
## 6
## Padampokhari police station
## 4
## Pashupatinagar police station
## 13
## Thulodamar police station
## 4
## District police office, Makawanpur
## 65
## Fakhel police station
## 4
## Sisneri police station
## 4
## Bharta police station
## 3
## Dandakharka police station
## 3
## Manahari police station
## 16
## Sunachuri police station
## 2
## Chitlang police station
## 4
## Palung police station
## 11
## Simbhanjyang police station
## 2
## [1] "Frequency table after encoding"
## police_id. Police Station
## 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629
## 6 2 2 2 13 2 4 4 16 4 4 2 2 11 6 2 3 2 8 2 20
## 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650
## 16 4 2 8 13 7 2 4 3 2 2 4 6 56 3 5 75 9 10 7 6
## 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671
## 4 2 8 70 2 3 25 3 2 13 2 5 3 3 2 4 2 14 2 3 2
## 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692
## 9 3 4 6 2 9 4 3 2 4 4 3 3 4 5 3 2 4 2 49 11
## 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713
## 2 3 2 91 2 3 2 14 4 14 4 3 3 4 3 3 9 4 2 12 17
## 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728
## 6 3 4 2 3 4 2 3 19 14 4 58 4 2 65
# Recode birth year into 5 year groups.
break_year <- c(999,2020,2025,2030,2035,2040,2045,2050, 2055)
labels_year <- c("Missing" =1,
"2020-2024" =2,
"2025-2029" =3,
"2030-2034" =4,
"2035-2039" =5,
"2040-2044" =6,
"2045-2049" =7,
"2050-2054" =8)
mydata <- ordinal_recode (variable="dem4", break_points=break_year, missing=999999, value_labels=labels_year)
## [1] "Frequency table before encoding"
## dem4. What year were you born? [Answer in Nepali Years] [Use the timeline in the manu
## 999 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036
## 2 2 3 3 6 11 10 4 12 13 14 14 18 13 28 20 39
## 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053
## 40 51 66 69 73 78 63 94 43 40 41 54 43 57 15 15 11
## recoded
## [999,2020) [2020,2025) [2025,2030) [2030,2035) [2035,2040) [2040,2045)
## 999 2 0 0 0 0 0
## 2021 0 2 0 0 0 0
## 2022 0 3 0 0 0 0
## 2023 0 3 0 0 0 0
## 2024 0 6 0 0 0 0
## 2025 0 0 11 0 0 0
## 2026 0 0 10 0 0 0
## 2027 0 0 4 0 0 0
## 2028 0 0 12 0 0 0
## 2029 0 0 13 0 0 0
## 2030 0 0 0 14 0 0
## 2031 0 0 0 14 0 0
## 2032 0 0 0 18 0 0
## 2033 0 0 0 13 0 0
## 2034 0 0 0 28 0 0
## 2035 0 0 0 0 20 0
## 2036 0 0 0 0 39 0
## 2037 0 0 0 0 40 0
## 2038 0 0 0 0 51 0
## 2039 0 0 0 0 66 0
## 2040 0 0 0 0 0 69
## 2041 0 0 0 0 0 73
## 2042 0 0 0 0 0 78
## 2043 0 0 0 0 0 63
## 2044 0 0 0 0 0 94
## 2045 0 0 0 0 0 0
## 2046 0 0 0 0 0 0
## 2047 0 0 0 0 0 0
## 2048 0 0 0 0 0 0
## 2049 0 0 0 0 0 0
## 2050 0 0 0 0 0 0
## 2051 0 0 0 0 0 0
## 2052 0 0 0 0 0 0
## 2053 0 0 0 0 0 0
## recoded
## [2045,2050) [2050,2055) [2055,1e+06)
## 999 0 0 0
## 2021 0 0 0
## 2022 0 0 0
## 2023 0 0 0
## 2024 0 0 0
## 2025 0 0 0
## 2026 0 0 0
## 2027 0 0 0
## 2028 0 0 0
## 2029 0 0 0
## 2030 0 0 0
## 2031 0 0 0
## 2032 0 0 0
## 2033 0 0 0
## 2034 0 0 0
## 2035 0 0 0
## 2036 0 0 0
## 2037 0 0 0
## 2038 0 0 0
## 2039 0 0 0
## 2040 0 0 0
## 2041 0 0 0
## 2042 0 0 0
## 2043 0 0 0
## 2044 0 0 0
## 2045 43 0 0
## 2046 40 0 0
## 2047 41 0 0
## 2048 54 0 0
## 2049 43 0 0
## 2050 0 57 0
## 2051 0 15 0
## 2052 0 15 0
## 2053 0 11 0
## [1] "Frequency table after encoding"
## dem4. What year were you born? [Answer in Nepali Years] [Use the timeline in the manu
## Missing 2020-2024 2025-2029 2030-2034 2035-2039 2040-2044 2045-2049 2050-2054
## 2 14 50 87 216 377 221 98
## [1] "Inspect value labels and relabel as necessary"
## Missing 2020-2024 2025-2029 2030-2034 2035-2039 2040-2044 2045-2049 2050-2054
## 1 2 3 4 5 6 7 8
# Recode education attainment into standard Nepal categories
break_edu <- c(0,7,10,12,13,14,16,17,18, 777, 888, 999)
labels_edu <- c("Primary or less (0-5)" = 1,
"Lower secondary (6-8)" = 2,
"Secondary (9-10)" = 3,
"SLC (11)" = 4,
"CLASS 12/Intermediate level (12)" = 5,
"Bachelor/Postgraduate level" = 6,
"Literate, but never attended school" = 7,
"Illiterate, and never attended school"= 8,
"Refused"= 9,
"Does not apply" = 10,
"Don't Know" = 11)
mydata <- ordinal_recode (variable="dem6", break_points=break_edu, missing=999, value_labels=labels_edu)
## [1] "Frequency table before encoding"
## dem6. What is your highest completed education level? [You do not need to read the r
## Class 1 Class 2
## 1 1
## Class 3 Class 4
## 2 11
## Class 5 Class 6
## 56 17
## Class 7 Class 8
## 75 94
## Class 9 Class 10
## 35 95
## slc Class 12/Intermediate Level
## 300 264
## Bachelor Level Master Level
## 82 31
## Illiterate, and never attended school
## 1
## recoded
## [0,7) [7,10) [10,12) [12,13) [13,14) [14,16) [16,17) [17,18) [18,777) [777,888)
## 2 1 0 0 0 0 0 0 0 0 0
## 3 1 0 0 0 0 0 0 0 0 0
## 4 2 0 0 0 0 0 0 0 0 0
## 5 11 0 0 0 0 0 0 0 0 0
## 6 56 0 0 0 0 0 0 0 0 0
## 7 0 17 0 0 0 0 0 0 0 0
## 8 0 75 0 0 0 0 0 0 0 0
## 9 0 94 0 0 0 0 0 0 0 0
## 10 0 0 35 0 0 0 0 0 0 0
## 11 0 0 95 0 0 0 0 0 0 0
## 12 0 0 0 300 0 0 0 0 0 0
## 13 0 0 0 0 264 0 0 0 0 0
## 14 0 0 0 0 0 82 0 0 0 0
## 15 0 0 0 0 0 31 0 0 0 0
## 17 0 0 0 0 0 0 0 1 0 0
## recoded
## [888,999) [999,1e+03)
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## 7 0 0
## 8 0 0
## 9 0 0
## 10 0 0
## 11 0 0
## 12 0 0
## 13 0 0
## 14 0 0
## 15 0 0
## 17 0 0
## [1] "Frequency table after encoding"
## dem6. What is your highest completed education level? [You do not need to read the r
## Primary or less (0-5) Lower secondary (6-8)
## 71 186
## Secondary (9-10) SLC (11)
## 130 300
## CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 264 113
## Illiterate, and never attended school
## 1
## [1] "Inspect value labels and relabel as necessary"
## Primary or less (0-5) Lower secondary (6-8)
## 1 2
## Secondary (9-10) SLC (11)
## 3 4
## CLASS 12/Intermediate level (12) Bachelor/Postgraduate level
## 5 6
## Literate, but never attended school Illiterate, and never attended school
## 7 8
## Refused Does not apply
## 9 10
## Don't Know
## 11
mydata <- top_recode ("dem9", break_point=2, missing=999999) # Topcode cases with 2 or more children, as there are only 16 cases with 3 or more.
## [1] "Frequency table before encoding"
## dem9. How many sons do you have? [Open-ended]
## 0 1 2 3 4
## 396 483 170 14 2
## [1] "Frequency table after encoding"
## dem9. How many sons do you have? [Open-ended]
## 0 1 2 or more
## 396 483 186
mydata <- top_recode ("dem10", break_point=2, missing=999999) # Topcode cases with 2 or more children, as there are only 36 cases with 3 or more.
## [1] "Frequency table before encoding"
## dem10. How many daughters do you have? [Open-ended]
## 0 1 2 3 4 5
## 499 401 129 29 2 5
## [1] "Frequency table after encoding"
## dem10. How many daughters do you have? [Open-ended]
## 0 1 2 or more
## 499 401 165
# Top code high income to the 99.5 percentile
percentile_99.5 <- floor(quantile(mydata$dem11[mydata$dem11>999], probs = c(0.995)))
mydata <- top_recode (variable="dem11", break_point=percentile_99.5, missing=999)
## [1] "Frequency table before encoding"
## dem11. In a typical month, approximately what is your household's cash income? (in NRS)
## 777 888 999 10000 12500 15000 16000 16500 17000 17200 17210 17230
## 1 1 5 4 1 11 8 1 113 3 1 1
## 17330 17500 17550 18000 18235 18300 18340 18500 18600 18650 19000 20000
## 1 8 1 43 1 1 2 1 1 1 5 70
## 21000 22000 22200 22500 22600 22630 23000 23500 23800 24000 25000 26000
## 8 74 1 4 1 1 51 3 1 14 69 8
## 27000 27500 28000 28500 29000 30000 32000 33000 34000 35000 35500 36000
## 10 2 10 1 4 82 11 3 10 55 1 3
## 37000 38000 39000 40000 42000 43000 44000 45000 47000 48000 50000 52000
## 3 1 2 90 7 1 5 37 2 1 74 1
## 55000 56000 58000 60000 65000 68000 70000 75000 80000 85000 90000 1e+05
## 8 2 1 34 2 1 14 4 10 3 4 21
## 120000 125000 130000 150000 155000 160000 175000 2e+05 250000 4e+05
## 2 1 2 8 1 1 1 7 1 1
## [1] "Frequency table after encoding"
## dem11. In a typical month, approximately what is your household's cash income? (in NRS)
## 777 888 999 10000 12500 15000
## 1 1 5 4 1 11
## 16000 16500 17000 17200 17210 17230
## 8 1 113 3 1 1
## 17330 17500 17550 18000 18235 18300
## 1 8 1 43 1 1
## 18340 18500 18600 18650 19000 20000
## 2 1 1 1 5 70
## 21000 22000 22200 22500 22600 22630
## 8 74 1 4 1 1
## 23000 23500 23800 24000 25000 26000
## 51 3 1 14 69 8
## 27000 27500 28000 28500 29000 30000
## 10 2 10 1 4 82
## 32000 33000 34000 35000 35500 36000
## 11 3 10 55 1 3
## 37000 38000 39000 40000 42000 43000
## 3 1 2 90 7 1
## 44000 45000 47000 48000 50000 52000
## 5 37 2 1 74 1
## 55000 56000 58000 60000 65000 68000
## 8 2 1 34 2 1
## 70000 75000 80000 85000 90000 1e+05
## 14 4 10 3 4 21
## 120000 125000 130000 150000 155000 160000
## 2 1 2 8 1 1
## 175000 2e+05 or more
## 1 9
# Group year of employment into 5-year groups.
break_year <- c(0, seq(2050, 2075, 5))
labels_year <- c("Before 2050"= 1,
"2050-2054" = 2,
"2055-2059" = 3,
"2060-2064" = 4,
"2065-2069" = 5,
"2070-2074" = 6)
mydata <- ordinal_recode (variable="emp1y", break_points=break_year, missing=999999, value_labels=labels_year)
## [1] "Frequency table before encoding"
## emp1y. Year
## 2029 2030 2036 2041 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056
## 1 1 1 1 1 2 10 9 10 18 8 12 18 12 17 27 17
## 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2069 2070 2071 2072 2073
## 33 76 63 51 78 67 134 91 2 20 53 93 78 3 54 4
## recoded
## [0,2050) [2050,2055) [2055,2060) [2060,2065) [2065,2070) [2070,2075)
## 2029 1 0 0 0 0 0
## 2030 1 0 0 0 0 0
## 2036 1 0 0 0 0 0
## 2041 1 0 0 0 0 0
## 2044 1 0 0 0 0 0
## 2045 2 0 0 0 0 0
## 2046 10 0 0 0 0 0
## 2047 9 0 0 0 0 0
## 2048 10 0 0 0 0 0
## 2049 18 0 0 0 0 0
## 2050 0 8 0 0 0 0
## 2051 0 12 0 0 0 0
## 2052 0 18 0 0 0 0
## 2053 0 12 0 0 0 0
## 2054 0 17 0 0 0 0
## 2055 0 0 27 0 0 0
## 2056 0 0 17 0 0 0
## 2057 0 0 33 0 0 0
## 2058 0 0 76 0 0 0
## 2059 0 0 63 0 0 0
## 2060 0 0 0 51 0 0
## 2061 0 0 0 78 0 0
## 2062 0 0 0 67 0 0
## 2063 0 0 0 134 0 0
## 2064 0 0 0 91 0 0
## 2065 0 0 0 0 2 0
## 2066 0 0 0 0 20 0
## 2067 0 0 0 0 53 0
## 2069 0 0 0 0 93 0
## 2070 0 0 0 0 0 78
## 2071 0 0 0 0 0 3
## 2072 0 0 0 0 0 54
## 2073 0 0 0 0 0 4
## recoded
## [2075,1e+06)
## 2029 0
## 2030 0
## 2036 0
## 2041 0
## 2044 0
## 2045 0
## 2046 0
## 2047 0
## 2048 0
## 2049 0
## 2050 0
## 2051 0
## 2052 0
## 2053 0
## 2054 0
## 2055 0
## 2056 0
## 2057 0
## 2058 0
## 2059 0
## 2060 0
## 2061 0
## 2062 0
## 2063 0
## 2064 0
## 2065 0
## 2066 0
## 2067 0
## 2069 0
## 2070 0
## 2071 0
## 2072 0
## 2073 0
## [1] "Frequency table after encoding"
## emp1y. Year
## Before 2050 2050-2054 2055-2059 2060-2064 2065-2069 2070-2074
## 54 67 216 421 168 139
## [1] "Inspect value labels and relabel as necessary"
## Before 2050 2050-2054 2055-2059 2060-2064 2065-2069 2070-2074
## 1 2 3 4 5 6
# !!!Include relevant variables in list below
indirect_PII <- c("list_id",
"police_rank",
"dem3",
"dem5",
"dem7",
"dem8",
"emp20_1",
"emp20_2",
"emp20_5",
"govlaw0",
"enumimp1_a",
"enumimp1_b",
"enumimp1_c")
capture_tables (indirect_PII)
# Recode those with very specific values where more than half of the sample have actual data.
# Top code high police ranks, as only a few "Superintendent of Police (SP)", "Deputy Superintendent of Police (DSP)", or "Inspector\t"
break_rank <- c(0, 4, 5, 6, 7,8)
labels_rank <- c("Superintendent, Deputy Superintendent, or Inspector"= 1,
"Sub Inspector" = 2,
"Assistant Sub Inspector" = 3,
"Head Constable" = 4,
"Constable" = 5)
mydata <- ordinal_recode (variable="police_rank", break_points=break_rank, missing=999999, value_labels=labels_rank)
## [1] "Frequency table before encoding"
## police_rank. What is your police rank?
## Superintendent of Police (SP) Deputy Superintendent of Police (DSP)
## 5 8
## Inspector\t Sub Inspector\t\t
## 18 65
## Assistant Sub Inspector\t\t Head Constable\t\t
## 132 237
## Constable\t
## 600
## recoded
## [0,4) [4,5) [5,6) [6,7) [7,8) [8,1e+06)
## 1 5 0 0 0 0 0
## 2 8 0 0 0 0 0
## 3 18 0 0 0 0 0
## 4 0 65 0 0 0 0
## 5 0 0 132 0 0 0
## 6 0 0 0 237 0 0
## 7 0 0 0 0 600 0
## [1] "Frequency table after encoding"
## police_rank. What is your police rank?
## Superintendent, Deputy Superintendent, or Inspector
## 31
## Sub Inspector
## 65
## Assistant Sub Inspector
## 132
## Head Constable
## 237
## Constable
## 600
## [1] "Inspect value labels and relabel as necessary"
## Superintendent, Deputy Superintendent, or Inspector
## 1
## Sub Inspector
## 2
## Assistant Sub Inspector
## 3
## Head Constable
## 4
## Constable
## 5
mydata <- ordinal_recode (variable="enumimp1_c", break_points=break_rank, missing=999999, value_labels=labels_rank)
## [1] "Frequency table before encoding"
## enumimp1_c. What is the police rank of the respondent?
## Superintendent of Police (SP) Deputy Superintendent of Police (DSP)
## 5 8
## Inspector\t Sub Inspector\t\t
## 18 65
## Assistant Sub Inspector\t\t Head Constable\t\t
## 132 237
## Constable\t
## 600
## recoded
## [0,4) [4,5) [5,6) [6,7) [7,8) [8,1e+06)
## 1 5 0 0 0 0 0
## 2 8 0 0 0 0 0
## 3 18 0 0 0 0 0
## 4 0 65 0 0 0 0
## 5 0 0 132 0 0 0
## 6 0 0 0 237 0 0
## 7 0 0 0 0 600 0
## [1] "Frequency table after encoding"
## enumimp1_c. What is the police rank of the respondent?
## Superintendent, Deputy Superintendent, or Inspector
## 31
## Sub Inspector
## 65
## Assistant Sub Inspector
## 132
## Head Constable
## 237
## Constable
## 600
## [1] "Inspect value labels and relabel as necessary"
## Superintendent, Deputy Superintendent, or Inspector
## 1
## Sub Inspector
## 2
## Assistant Sub Inspector
## 3
## Head Constable
## 4
## Constable
## 5
# Encode caste and political party, as many small groups
mydata <- encode_location (variables= "dem5", missing=999999)
## [1] "Frequency table before encoding"
## dem5. What is your ethnic background? [You do not need to read the response choices.]
## Chhetri Brahmin (Hill) Magar Tharu
## 279 137 69 66
## Tamang Newar Muslim Kami
## 67 64 5 14
## Yadav Rai Gurung Damain/Dholi
## 67 15 21 18
## Limbu Thakuri Sarki Teli
## 2 15 6 22
## Chamar/Harijan/Ram Koiri Kurmi Dusadh/Paswan/Pasi
## 5 7 20 7
## Sonar Brahman (Tarai) Gharti/Bhujel Kalwar
## 4 22 7 1
## Kumal Hajam/Thakur Sudhi Lohar
## 15 6 1 1
## Tatma Majhi Nuniya Kumhar
## 5 9 2 2
## Danuwar Chepang/Praja Haluwai Rajput
## 3 7 2 5
## Darai Pahari Bote Adibasi/Janajati
## 3 1 1 3
## Badi Other Caste Don't know
## 1 57 1
## [1] "Frequency table after encoding"
## dem5. What is your ethnic background? [You do not need to read the response choices.]
## 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404
## 5 14 2 69 5 7 2 15 64 67 7 2 1 66 2 22 6 67 9 1 18
## 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425
## 57 3 21 20 1 7 1 5 1 6 4 15 3 7 3 279 5 15 22 137 1
## 426
## 1
mydata <- encode_location (variables= "govlaw0", missing=999999)
## [1] "Frequency table before encoding"
## govlaw0. If you had to say, which Nepali party do you feel closest to? [You do not nee
## Madheshi Jana Adhikari Forum, Nepal
## 1
## Nepal Communist Party (CPN UML)
## 99
## Nepal Communist Party (Marxist <U+0096> Leninist) \t
## 7
## Nepali Congress Party (NCP) \t
## 74
## Rastriya Prajatantra Party
## 4
## Rastriya Prajatantra Party Nepal
## 1
## Sadbhawana Party
## 1
## Terai <U+0096> Madhesh Loktantrik Party \t
## 2
## Unified Nepal Communist Party (Maoist)
## 11
## Other (please specify): _____
## 2
## I do not feel close to any party
## 822
## Refused to answer
## 28
## <NA>
## 13
## [1] "Frequency table after encoding"
## govlaw0. If you had to say, which Nepali party do you feel closest to? [You do not nee
## 661 662 663 664 665 666 667 668 669 670 671 672 <NA>
## 99 822 7 11 1 2 28 74 1 2 4 1 13
# Recode religion
break_rel <- c(1,2,3, 777, 888, 999)
labels_rel <- c("Hindu" = 1,
"Buddhist" = 2,
"Other" = 3,
"Refused" = 4,
"Not applicable" = 5,
"Don't know" = 6)
mydata <- ordinal_recode (variable="dem7", break_points=break_rel, missing=999, value_labels=labels_rel)
## [1] "Frequency table before encoding"
## dem7. What is your religious background? [You do not need to read the response choice
## Hindu Buddhist Islam Kirant Christian
## 986 64 6 1 8
## recoded
## [1,2) [2,3) [3,777) [777,888) [888,999) [999,1e+03)
## 1 986 0 0 0 0 0
## 2 0 64 0 0 0 0
## 3 0 0 6 0 0 0
## 4 0 0 1 0 0 0
## 5 0 0 8 0 0 0
## [1] "Frequency table after encoding"
## dem7. What is your religious background? [You do not need to read the response choice
## Hindu Buddhist Other
## 986 64 15
## [1] "Inspect value labels and relabel as necessary"
## Hindu Buddhist Other Refused Not applicable
## 1 2 3 4 5
## Don't know
## 6
# Recode marital status
break_mar <- c(1,2,777, 888, 999)
labels_mar <- c("Married" = 1,
"Other" = 2,
"Refused" = 3,
"Not applicable" = 4,
"Don't know" = 5)
mydata <- ordinal_recode (variable="dem8", break_points=break_mar, missing=999, value_labels=labels_mar)
## [1] "Frequency table before encoding"
## dem8. What is your marital status?
## Married Separated/Divorced Widowed Never Married
## 930 4 2 129
## recoded
## [1,2) [2,777) [777,888) [888,999) [999,1e+03)
## 1 930 0 0 0 0
## 2 0 4 0 0 0
## 3 0 2 0 0 0
## 4 0 129 0 0 0
## [1] "Frequency table after encoding"
## dem8. What is your marital status?
## Married Other
## 930 135
## [1] "Inspect value labels and relabel as necessary"
## Married Other Refused Not applicable Don't know
## 1 2 3 4 5
# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
selectedKeyVars = c('dem3', 'police_rank', "dem4") ##!!! Replace with candidate categorical demo vars
# weight variable
selectedWeightVar = c('weight') ##!!! Replace with weight var
# creating the sdcMicro object with the assigned variables
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars, weightVar=selectedWeightVar)
sdcInitial
## The input dataset consists of 1065 rows and 314 variables.
## --> Categorical key variables: dem3, police_rank, dem4
## --> Weight variable: weight
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## dem3 2 (2) 532.500 (532.500) 168
## police_rank 5 (5) 213.000 (213.000) 31
## dem4 8 (8) 133.125 (133.125) 2
##
## (168)
## (31)
## (2)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 8 (0.751%)
## - 3-anonymity: 18 (1.690%)
## - 5-anonymity: 39 (3.662%)
##
## ----------------------------------------------------------------------
Show values of key variable of records that violate k-anonymity
#mydata <- labelDataset(mydata)
notAnon <- sdcInitial@risk$individual[,2] < 2 # for 2-anonymity
mydata[notAnon,selectedKeyVars]
## Registered S3 method overwritten by 'cli':
## method from
## print.boxx spatstat.geom
## # A tibble: 8 x 3
## dem3 police_rank dem4
## <dbl+lbl> <dbl+lbl> <dbl+lbl>
## 1 2 [Female] 1 [Superintendent, Deputy Superintendent, or Inspector] 7 [2045-2049]
## 2 2 [Female] 1 [Superintendent, Deputy Superintendent, or Inspector] 3 [2025-2029]
## 3 2 [Female] 3 [Assistant Sub Inspector] 7 [2045-2049]
## 4 1 [Male] 4 [Head Constable] 7 [2045-2049]
## 5 1 [Male] 5 [Constable] 2 [2020-2024]
## 6 2 [Female] 5 [Constable] 5 [2035-2039]
## 7 2 [Female] 2 [Sub Inspector] 3 [2025-2029]
## 8 2 [Female] 3 [Assistant Sub Inspector] 8 [2050-2054]
sdcFinal <- localSuppression(sdcInitial)
# Recombining anonymized variables
extractManipData(sdcFinal)[notAnon,selectedKeyVars] # manipulated variables HH
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## dem3 police_rank dem4
## 1 2 1 NA
## 192 2 NA 3
## 389 2 3 NA
## 585 1 4 NA
## 690 1 5 NA
## 720 2 5 NA
## 864 2 2 NA
## 871 2 3 NA
mydata [192,"police_rank"] <- NA
mydata [192,"enumimp1_c"] <- NA
mydata [notAnon,"dem4"] <- NA
mydata [192,"dem4"] <- 3
# Check that 2-anonimity has been achieved.
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars, weightVar=selectedWeightVar)
sdcInitial
## The input dataset consists of 1065 rows and 314 variables.
## --> Categorical key variables: dem3, police_rank, dem4
## --> Weight variable: weight
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## dem3 2 (2) 532.500 (532.500) 168
## police_rank 6 (6) 212.800 (212.800) 30
## dem4 9 (9) 132.250 (132.250) 2
##
## (168)
## (30)
## (2)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 0 (0.000%)
## - 3-anonymity: 5 (0.469%)
## - 5-anonymity: 21 (1.972%)
##
## ----------------------------------------------------------------------
# !!! Identify open-end variables here:
open_ends <- c("start_failure_reason_other",
"start_failure_reason_other_eng",
"mc1_other",
"mc1_other_eng",
"ht25o",
"ht25o_eng",
"ht26o",
"ht26o_eng",
"govlaw0_other",
"govlaw0_other_eng",
"enumimp9_other",
"enumimp7o",
"enumimp7o_eng",
"enumimp10",
"enumimp10_eng",
"poster_read")
report_open (list_open_ends = open_ends)
# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number
mydata <- mydata[!names(mydata) %in% c("govlaw0_other","govlaw0_other_eng")] # Specific political party mentioned
# Setup map
countrymap <- map_data("world") %>% filter(region=="Nepal") #!!! Select correct country
#admin <- raster::getData("GADM", country="NP", level=0) #!!! Select correct country map using standard 2-letter country codes: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
admin <- readRDS(file="gadm36_NPL_0_sp.rds")
# Displace all pairs of GPS variables (Longitude, Latitude). Check summary statistics and maps before and after displacement.
gps.vars <- c("Longitude", "Latitude") # !!!Include relevant variables, always longitude first, latitude second.
mydata <- displace(gps.vars, admin=admin, samp_num=1, other_num=100000) # May take a few minutes to process.
## Warning: Removed 34 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## Longitude Latitude
## Min. :84.25 Min. :27.21
## 1st Qu.:84.89 1st Qu.:27.59
## Median :85.16 Median :27.67
## Mean :85.11 Mean :27.64
## 3rd Qu.:85.43 3rd Qu.:27.68
## Max. :85.80 Max. :28.08
## NA's :34 NA's :34
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 34 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## Longitude Latitude
## Min. :84.25 Min. :27.20
## 1st Qu.:84.87 1st Qu.:27.59
## Median :85.17 Median :27.66
## Mean :85.12 Mean :27.64
## 3rd Qu.:85.44 3rd Qu.:27.69
## Max. :85.80 Max. :28.07
## NA's :34 NA's :34
## [1] "Processing time = 2.92298678557078"
gps.vars <- c("GPSinitial_LO", "GPSinitial_LA") # !!!Include relevant variables, always longitude first, latitude second.
mydata <- displace(gps.vars, admin=admin, samp_num=1, other_num=100000) # May take a few minutes to process.
## Warning: Removed 30 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## GPSinitial_LO GPSinitial_LA
## Min. :84.25 Min. :27.21
## 1st Qu.:84.88 1st Qu.:27.60
## Median :85.16 Median :27.67
## Mean :85.11 Mean :27.64
## 3rd Qu.:85.43 3rd Qu.:27.68
## Max. :85.80 Max. :28.08
## NA's :30 NA's :30
## Warning: Removed 30 rows containing missing values (geom_point).
## Warning: Removed 30 rows containing missing values (geom_point).
## Warning: Removed 30 rows containing missing values (geom_point).
## Warning: Removed 30 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## GPSinitial_LO GPSinitial_LA
## Min. :84.24 Min. :27.20
## 1st Qu.:84.87 1st Qu.:27.59
## Median :85.15 Median :27.66
## Mean :85.11 Mean :27.64
## 3rd Qu.:85.43 3rd Qu.:27.69
## Max. :85.83 Max. :28.10
## NA's :30 NA's :30
## [1] "Processing time = 2.82884380022685"
gps.vars <- c("gpsht13a_LO", "gpsht13a_LA") # !!!Include relevant variables, always longitude first, latitude second.
mydata <- displace(gps.vars, admin=admin, samp_num=1, other_num=100000) # May take a few minutes to process.
## Warning: Removed 119 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## gpsht13a_LO gpsht13a_LA
## Min. :84.25 Min. :27.21
## 1st Qu.:84.90 1st Qu.:27.58
## Median :85.25 Median :27.67
## Mean :85.13 Mean :27.64
## 3rd Qu.:85.43 3rd Qu.:27.68
## Max. :85.80 Max. :28.08
## NA's :119 NA's :119
## Warning: Removed 119 rows containing missing values (geom_point).
## Warning: Removed 119 rows containing missing values (geom_point).
## Warning: Removed 119 rows containing missing values (geom_point).
## Warning: Removed 119 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## gpsht13a_LO gpsht13a_LA
## Min. :84.21 Min. :27.18
## 1st Qu.:84.89 1st Qu.:27.58
## Median :85.22 Median :27.65
## Mean :85.13 Mean :27.64
## 3rd Qu.:85.44 3rd Qu.:27.69
## Max. :85.81 Max. :28.09
## NA's :119 NA's :119
## [1] "Processing time = 2.57622871398926"
gps.vars <- c("gpsenumip_LO", "gpsenumip_LA") # !!!Include relevant variables, always longitude first, latitude second.
mydata <- displace(gps.vars, admin=admin, samp_num=1, other_num=100000) # May take a few minutes to process.
## Warning: Removed 136 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics before displacement"
## gpsenumip_LO gpsenumip_LA
## Min. :84.25 Min. :27.21
## 1st Qu.:84.90 1st Qu.:27.58
## Median :85.25 Median :27.67
## Mean :85.14 Mean :27.64
## 3rd Qu.:85.43 3rd Qu.:27.68
## Max. :85.80 Max. :28.08
## NA's :136 NA's :136
## Warning: Removed 136 rows containing missing values (geom_point).
## Warning: Removed 136 rows containing missing values (geom_point).
## Warning: Removed 136 rows containing missing values (geom_point).
## Warning: Removed 136 rows containing missing values (geom_point).
## [1] "Summary Long/Lat statistics after displacement"
## gpsenumip_LO gpsenumip_LA
## Min. :84.23 Min. :27.17
## 1st Qu.:84.90 1st Qu.:27.58
## Median :85.25 Median :27.65
## Mean :85.14 Mean :27.64
## 3rd Qu.:85.44 3rd Qu.:27.69
## Max. :85.83 Max. :28.10
## NA's :136 NA's :136
## [1] "Processing time = 2.35587553183238"
Adds "_PU" (Public Use) to the end of the name
haven::write_dta(mydata, paste0(filename, "_PU.dta"))
haven::write_sav(mydata, paste0(filename, "_PU.sav"))