# Clear the workspace so the run starts from a clean state.
# `all.names = TRUE` also removes objects whose names begin with a dot.
# The flag is spelled out in full: `t` is base R's transpose function, not
# the logical TRUE, and `all` only reached `all.names` by partial matching.
rm(list = ls(all.names = TRUE))
# Start RStudio by double-clicking on "PII_remover_1.5.R".
# Make sure the following files are in the same folder:
#   "PII_remover_1.5.R"
#   "functions_1.5.R"
#   Dataset to be processed (e.g. "Nepal Round 3_FinalClean.dta")
# Base name of the dataset to process, written without a file extension.
# !!!Update filename
filename <- "Nepal_HT_Study_Round2_062316"

# Load the helper functions; the path is relative, so "functions_1.5.R" must
# be in the working directory.
# NOTE(review): `mydata` is never created in this script; presumably
# functions_1.5.R loads it based on `filename` -- confirm.
source(file = "functions_1.5.R")

# For a fast check on only the first 10 rows, uncomment the next line.
# mydata <- mydata[1:10, ]
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
#!!!Save flagged dictionary in .xlsx format, add "DatasetReview" to name and continue processing data with subset of flagged variables
# !!!No Direct PII
# !!!No Direct PII-team
# Small locations: Encode locations with pop <100,000 using random large numbers
# !!!Include relevant variables, but check their population size first to confirm they are <100,000
# Names of the location variables to encode. The recorded output below labels
# them "VDC code" and "Ward Number".
locvars <- c(
  "VDC",
  "ID_4"
)

# encode_location() is not defined in this script (presumably it comes from
# functions_1.5.R). Judging by the recorded output below, it prints a frequency
# table for each variable before and after encoding.
# NOTE(review): `missing = 999999` looks like the code that marks missing
# values -- confirm against the helper's definition.
mydata <- encode_location(
  variables = locvars,
  missing = 999999
)
## [1] "Frequency table before encoding"
## VDC. VDC code
## Barahathawa Dhungrekhola Dhurkauli Lalbandi Malangawa N.P.
## 63 63 59 61 63
## Netraganj Raniganj Sankarpur Bhimeswor N.P. Bocha
## 64 64 61 61 61
## Dandakharka Fasku Katakuti Lamidada Melung
## 62 62 63 62 64
## Pawati Badegau Talramarang BhoteNamlang Irkhu
## 63 63 64 65 62
## Ichok Kadambas Langarche Melamchi Anaikot
## 62 63 62 62 63
## BaluwapatiDeupur ChalalGaneshsthan KalatiBhumidanda MahankalChaur Methinkot
## 62 67 62 61 62
## Patalekhet RaviOpi Balkot Changunarayan Chitapol
## 64 62 59 62 63
## Duwakot Gundu Madhyapur Thimi NP Nankhel Sirutar
## 63 63 66 61 58
## Baireni Dhussa Khari Kiranchok Naubise
## 62 64 62 63 63
## Salyantar SunaulaBazar Thakre Chitlang Churiyamai
## 63 62 63 61 63
## Fakhel Kulekhani Nibuwatar Padampokhari ShreepurChhatiwan
## 62 62 60 65 62
## SisneriMahadevsthan Birendranagar Jutpani Kathar Khairahani
## 63 63 62 63 63
## Padampur Parbatipur Piple Shaktikhor Chhayachhetra
## 62 60 65 62 62
## Damachaur Devsthal Dhanwang Phalawang Sibaratha
## 63 65 62 64 62
## Siddheswar Tribeni Baijapur Binauna Chisapani
## 63 61 63 62 65
## Khaskusma Kohalpur Nepalgunj Rajhena Samserganj
## 62 63 60 62 63
## <NA>
## 3
## [1] "Frequency table after encoding"
## VDC. VDC code
## 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002
## 63 63 59 61 63 64 64 61 61 61 62 62 63 62 64 63 63 64 65 62 62 63 62
## 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025
## 62 63 62 67 62 61 62 64 62 59 62 63 63 63 66 61 58 62 64 62 63 63 63
## 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048
## 62 63 61 63 62 62 60 65 62 63 63 62 63 63 62 60 65 62 62 63 65 62 64
## 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 <NA>
## 62 63 61 63 62 65 62 63 60 62 63 3
## [1] "Frequency table before encoding"
## ID_4. Ward Number
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 <NA>
## 650 638 394 576 404 650 528 423 569 25 28 38 38 38 3
## [1] "Frequency table after encoding"
## ID_4. Ward Number
## 281 282 283 284 285 286 287 288 289 290 291 292 293 294 <NA>
## 650 638 394 576 404 650 528 423 569 25 28 38 38 38 3
# !!! No indirect PII - Ordinal
# !!! No indirect PII - Categorical
# !!! No indirect or direct PII
# !!! No open-ends
# !!! No GPS
# Adds "_PU" (Public Use) to the end of the file name
# Export the processed data in Stata (.dta) and SPSS (.sav) formats. Each
# output file is named `filename` followed by "_PU" and the extension, and is
# written to the current working directory.
haven::write_dta(
  data = mydata,
  path = paste0(filename, "_PU.dta")
)
haven::write_sav(
  data = mydata,
  path = paste0(filename, "_PU.sav")
)