# Start from a clean workspace before processing.
# `all.names = TRUE` also removes hidden objects (names starting with ".").
# The previous call passed `t`, which is the base transpose function rather
# than TRUE, and relied on partial matching of `all` to `all.names`.
rm(list = ls(all.names = TRUE))

Setup and create dictionary

# Base name (no extension) of the dataset being processed.
filename <- "ProblemsAll" # !!!Update filename

# Load the project's helper script.
# NOTE(review): presumably this script also reads the dataset into `mydata`
# based on `filename` -- confirm in functions_1.5.R.
source(file = "functions_1.5.R")

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Location: Small Location (<100,000), Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 

Direct PII: variables to be removed

# Direct identifiers that must not appear in the public-use file.
# !!!Include any Direct PII variables
dropvars <- c("LE_reportedby", "flag_reportedby")

# Keep only the columns whose names are not listed in `dropvars`.
mydata <- mydata[!is.element(names(mydata), dropvars)]

Direct PII-team: Encode interviewer names, which may be useful for analysis of interviewer effects

# Encode Direct PII-team
# Pass the interviewer-name column to the project helper and keep its result
# as the working dataset.
mydata <- encode_direct_PII_team(variables = "surveyor")
## [1] "Frequency table before encoding"
## surveyor. Surveyor
##                           alka.adhikari    ambir.raj.kulung         amrita.roka anjana.kumari.dulal 
##                  37                  79                  96                  90                  98 
##     ashish.shrestha bhanu.bhakta.dhakal       dev.raj.nepal dhan.kumari.darlami       gita.maharjan 
##                  82                  77                   2                  85                  99 
##       kamala.sharma        manjula.giri min.kumari.shrestha       nabina.khadka      niraj.shrestha 
##                  79                  99                  86                  80                  85 
##    pramila.shrestha    pratika.shrestha rabischandra.bhatta   ram.kumar.acharya     sajina.shrestha 
##                  77                  85                  87                  88                  73 
##     sandip.shrestha       sapana.gautam     sarita.shrestha     tirtha.maya.rai        yamuna.karki 
##                  97                  80                  99                 105                  86 
## [1] "Frequency table after encoding"
## surveyor. Surveyor
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25 
##  37  79  96  90  98  82  77   2  85  99  79  99  86  80  85  77  85  87  88  73  97  80  99 105  86

Open-ends: review responses for any sensitive information, redact as necessary

# Free-text columns whose responses need a manual check for sensitive content.
open_ends <- c(
  "problem", "problemdetails", "problemnotes",
  "flag_LE_desc", "Flag_note", "L2_Problems",
  "Flag_Income_descriptionL2"
)

# Hand the column list to the project's reporting helper.
# NOTE(review): the warning it logs suggests it writes into a "verbatims"
# folder under the working directory -- confirm in the helper script.
report_open(list_open_ends = open_ends)
## Warning in dir.create(file.path(getwd(), "verbatims"), recursive = TRUE): 'C:\Users\C_Pablo_Diego-
## Rosell\Desktop\Other Projects\Dwight\ILAB PII\Data\FINAL\UC Berkeley_Nepal_Awareness-General
## Public\PublicData_R3\Data\ProblemsAll\verbatims' already exists
# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number 

# Overwrite the reviewed `flag_LE_desc` entries with their redacted text.
# Row numbers and replacement strings are paired by position.
mydata$flag_LE_desc[c(159, 180, 262, 263)] <- c(
  # Row 159: staff name redacted.
  "Taken from [Staff name redacted]'s L2 notes: Q 522(list experiment): The interviewer lists the sayings all at once at first. When the participant seems confused then she repeats the sayings one-by-one and asks whether they are applicable to the participant or not. Latest survey version 570. Collected using survey version 565.",
  # Row 180: exact age replaced by an age band.
  "Respondent was [55-64] years old and illiterate from remote place. She belongs to backward community and was unable to understand list experiment so respondent asked her by saying yes or no.",
  # Row 262: staff name redacted.
  "[Staff name redacted]'s L2 notes: does not read income options",
  # Row 263: community name redacted.
  "Respondent was from backward community [Community name redacted] and was unable to understand list experiment question."
)

# Overwrite the reviewed `L2_Problems` entries with their redacted text.
# The assignments touch distinct rows, so they are grouped by the kind of
# information that was redacted rather than by row order.

# --- Staff names redacted ---
mydata$L2_Problems[180] <- "GPS info missing. Flagged in QC problem sheet by [Staff name redacted]. The interviewer changes the information from the sheet for '0-100 %' scale to make the participant understand the scale and says 'there are 10 women ...', but the response is recorded in percentage. Latest survey version 570. Collected using survey version 565."
mydata$L2_Problems[916] <- "The audio for conjoint 2 [Q271] and Q 281 are silent. But the rest of the interview is very good quality. This interview is accepted after consulting with [Staff name redacted] about the two silent audio files."
mydata$L2_Problems[1510] <- "QC flag: The survey has no GPS. The survey is reviewed, and, from the audio, it has no problems. But it is not accepted as it needs [Staff name redacted]'s case by case assessment for GPS missing interviews. --> Approved as it does not have any other issues"

# --- Location names redacted ---
mydata$L2_Problems[509] <- "Interview location name discrepancy: The data show that the interview was conducted in [Location redacted], but the GPS record shows the location as [Location redacted]. Collected using survey version 569."
# Three rows share the same redacted note.
mydata$L2_Problems[c(1070, 1101, 1102)] <- "Interview was conducted in [Location redacted] and the map shows [Location redacted]. They are neighboring VDCs. Others are fine."
mydata$L2_Problems[1175] <- "Interview was conducted in [Location redacted] and the map shows [Location redacted]. They are neighboring VDCs. This is not an issue. Others are fine."
mydata$L2_Problems[1181] <- "This interview shows [Location redacted] in map and the interview was conducted in [Location redacted]. Both are neighboring VDCs. This is not an issue. Conjoint 1 recording is not present. Others are fine."
mydata$L2_Problems[1232] <- "This interview shows different VDC on map as [Location redacted]. According to new structure, [Location redacted] is under [Location redacted]. This is not an issue. Everything else is fine."
mydata$L2_Problems[1774] <- "The map shows unknown location but when clicked closed to the location it shows exact location as [Location redacted]. This is not an issue. Others are fine. Appoved."
# Three rows share the same redacted note.
mydata$L2_Problems[c(1873, 1883, 1892)] <- "The interview location was [Location redacted] and the map shows [Location redacted] as they are neighboring VDCs. This is not an issue. Others are fine."

# --- Exact ages replaced by age bands ---
mydata$L2_Problems[1347] <- "The respondent was of [25-34] years and was quick in responding to the questions and the interview was short. This interview does not have any problem."
mydata$L2_Problems[1398] <- "This interview has been flagged for short interview duration but this interview has no problem and quality is good. Respondent was of [15-24] years old and was quick in understanding and answering the questions."
mydata$L2_Problems[1400] <- "This interview has been flagged for short interview duration but this interview has no problem and quality is good. Respondent was [15-24] years old and he was quick in answering."
mydata$L2_Problems[1539] <- "Respondent was of [15-24] years old and was quick in responding. This interview does not have any problem."

# Overwrite the reviewed `problemdetails` entries with their redacted text.

# Staff names redacted.
mydata$problemdetails[1506] <- "[Staff name redacted] L2: QC auto flag by runtime script. But the survey is accepted as it does not present any other problems."
mydata$problemdetails[1510] <- "[Staff name redacted] L2:  Auto QC flag. The survey has no GPS. The survey is reviewed, and, from the audio, it has no problems. But it is not accepted as it needs [Staff name redacted]'s case by case assessment for the GPS missing interviews."

# Location names redacted; both rows receive the same note.
mydata$problemdetails[c(1808, 1809)] <- "VDC was mistakenly recorded as [Location redacted], which is corrected as [Location redacted] as in correction sheet sent by the enumerator."

Save processed data in Stata and SPSS format

Adds "_PU" (Public Use) to the end of the name

# Export the processed dataset in Stata (.dta) and SPSS (.sav) formats.
# Both files are named "<filename>_PU.<ext>".
haven::write_dta(data = mydata, path = sprintf("%s_PU.dta", filename))
haven::write_sav(data = mydata, path = sprintf("%s_PU.sav", filename))