#' ---
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#'      theme: united
#' ---

# Start from a clean workspace, including hidden (dot-prefixed) objects.
# `all.names` is spelled out and given the literal TRUE: the bare symbol `t`
# is base R's transpose function, not a logical value.
rm(list = ls(all.names = TRUE))

#'# Setup filenames

# Base name of the section being processed; the output files written at the
# end of this script are named after it.
# !!!Update filename
filename <- "Section_9"
# Helper-functions script that is sourced below.
# !!!Update helper functions file
functions_vers <- "functions_1.8.R"

#'# Setup data, functions and create dictionary for dataset review
source(file = functions_vers)


#'
#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location (>100,000)
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 


#'# Direct PII: variables to be removed
# !!!No Direct PII

#'# Direct PII-team: Encode field team names
# !!!No Direct PII-team

#'# Small locations: Encode locations  with pop <100,000 using random large numbers
# !!!No small locations

#'# Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less. 


# Top code high income to the 99.5 percentile

# Top-code every variable listed below at the floor of its 99.5th percentile.
# The percentile is computed on observed values only: NAs are dropped with
# na.exclude() and entries equal to the missing-value code are filtered out.
# The same code is handed to top_recode() through its `missing` argument.
# Variables are processed in the listed order, one top_recode() call each.
missing_code <- 999999
top_code_vars <- c(
  paste0("s9q", 1:14),
  "s9q15other",
  paste0("s9q", 32:52),
  "s9q73",
  paste0("s9q", 75:81)
)

for (top_code_var in top_code_vars) {
  # Drop NAs once, then exclude the missing-value code before taking the
  # percentile.
  observed <- na.exclude(mydata[[top_code_var]])
  percentile_99.5 <- floor(
    quantile(observed[observed != missing_code], probs = c(0.995))
  )
  # top_recode() returns the updated dataset, so it is reassigned each pass.
  mydata <- top_recode(
    variable = top_code_var,
    break_point = percentile_99.5,
    missing = missing_code
  )
}

#'# Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

# Indirect-PII variables passed to capture_tables() for review; each name is
# listed exactly once.
# NOTE(review): this list previously repeated "s9q9a" 17 times, which looks
# like a fill-down error. Judging by the names in the open-ends list further
# down, the intended entries were probably s9q10a-s9q14a and s9q32a-s9q41a,
# and "s9q50a" is absent from the s9q42a-s9q52a run. Confirm against
# dictionary.csv before adding any of them.
indirect_PII <- c(
  "s9q42a", "s9q43a", "s9q44a", "s9q45a", "s9q46a",
  "s9q47a", "s9q48a", "s9q49a", "s9q51a", "s9q52a",
  "s9q1a", "s9q2a", "s9q3a", "s9q4a", "s9q5a",
  "s9q6a", "s9q7a", "s9q8a", "s9q9a"
)
capture_tables(indirect_PII)

# !!!No data with specific values. 


#'# Matching and crosstabulations: Run automated PII check 
# !!! Insufficient demographic data

#'# Open-ends: review responses for any sensitive information, redact as necessary
#' 
# !!! Identify open-end variables here: 
# Open-ended (free-text) variables handed to report_open() for review.
# The order of the names is kept exactly as originally listed.
open_ends <- c(
  "s9q1awhynoresponse", "s9q1whynoresponse",
  "s9q2awhynoresponse", "s9q2whynoresponse",
  "s9q3awhynoresponse", "s9q3whynoresponse",
  "s9q4awhynoresponse", "s9q4whynoresponse",
  "s9q5awhynoresponse", "s9q5whynoresponse",
  "s9q6awhynoresponse", "s9q6whynoresponse",
  "s9q7awhynoresponse", "s9q7whynoresponse",
  "s9q8awhynoresponse", "s9q8whynoresponse",
  "s9q9awhynoresponse", "s9q9whynoresponse",
  "s9q10awhynoresponse", "s9q10whynoresponse",
  "s9q11awhynoresponse", "s9q11whynoresponse",
  "s9q12awhynoresponse", "s9q12whynoresponse",
  "s9q13awhynoresponse", "s9q13whynoresponse",
  "s9q14awhynoresponse", "s9q15",
  "s9q14whynoresponse", "s9q15otherwhynoresponse",
  "s9q32awhynoresponse", "s9q32whynoresponse",
  "s9q33awhynoresponse", "s9q33whynoresponse",
  "s9q34awhynoresponse", "s9q34whynoresponse",
  "s9q35awhynoresponse", "s9q35whynoresponse",
  "s9q36awhynoresponse", "s9q36whynoresponse",
  "s9q37awhynoresponse", "s9q37whynoresponse",
  "s9q38awhynoresponse", "s9q38whynoresponse",
  "s9q39awhynoresponse", "s9q39whynoresponse",
  "s9q40awhynoresponse", "s9q40whynoresponse",
  "s9q41awhynoresponse", "s9q41whynoresponse",
  "s9q42awhynoresponse", "s9q42whynoresponse",
  "s9q43awhynoresponse", "s9q43whynoresponse",
  "s9q44awhynoresponse", "s9q44whynoresponse",
  "s9q45awhynoresponse", "s9q45whynoresponse",
  "s9q46awhynoresponse", "s9q46whynoresponse",
  "s9q47awhynoresponse", "s9q47whynoresponse",
  "s9q48awhynoresponse", "s9q48whynoresponse",
  "s9q49awhynoresponse", "s9q49whynoresponse",
  "s9q50awhynoresponse", "s9q50whynoresponse",
  "s9q51awhynoresponse", "s9q51whynoresponse",
  "s9q52awhynoresponse", "s9q52whynoresponse",
  "s9q74",
  "s9q73whynoresponse", "s9q75whynoresponse",
  "s9q76whynoresponse", "s9q77whynoresponse",
  "s9q78whynoresponse", "s9q79whynoresponse",
  "s9q80whynoresponse", "s9q81whynoresponse"
)

report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number 

# Redact sensitive verbatim responses by row position. Rows that receive the
# same replacement text within a column are overwritten in one assignment.

# "Why no response" verbatims replaced with the "[language]" placeholder
mydata$s9q2whynoresponse[137] <- "[language]"
mydata$s9q5whynoresponse[63] <- "[language]"
mydata$s9q8whynoresponse[63] <- "[language]"
mydata$s9q14awhynoresponse[1088] <- "[language]"

# s9q15: responses replaced with the "[language]" placeholder
mydata$s9q15[c(
  3, 19, 181, 183, 238, 541, 744,
  1109, 1134, 1447, 1499, 1501, 1734
)] <- "[language]"

# Responses with person references masked
mydata$s9q32awhynoresponse[1102] <- "[name] is not informed how much his Son [name] is spending on load."
mydata$s9q40awhynoresponse[1294] <- "[person] is paying"

# s9q74: responses replaced entirely with the amount placeholder
mydata$s9q74[c(
  176, 884, 1109, 1175, 1268,
  1355, 1859, 1874, 1961, 2007
)] <- "[amount redacted]"

# s9q74: responses with only the amount masked
mydata$s9q74[857] <- "[amount redacted] of rice"
mydata$s9q74[1569] <- "[amount redacted] for house materials"

# s9q74: responses replaced with the "[language]" placeholder
mydata$s9q74[c(
  40, 507, 1054, 1096, 1111,
  1443, 1461, 1472, 1501, 1735
)] <- "[language]"

# s9q74: responses with names, illnesses, places or years masked
mydata$s9q74[504] <- "Vaccine for [name]"
mydata$s9q74[649] <- "Graduation fee and expenses of [name]. Uniform, shoes"
mydata$s9q74[665] <- "[illness] from her husband."
mydata$s9q74[1045] <- "Fare going to [city]"
mydata$s9q74[1049] <- "For Requirements and payment of her son studying in [small location]"
mydata$s9q74[1120] <- "Medicine of [name]"
mydata$s9q74[1187] <- "Hospitalization of [name]"
mydata$s9q74[1203] <- "Medicine, laboratory of Mother [name] and Son [name]"
mydata$s9q74[1333] <- "Fare transportation visiting her child in [city]"
mydata$s9q74[1395] <- "Medical expenses of her child who has [illness] and yhe other child who had [illness]"
mydata$s9q74[1752] <- "Hospitalization of [name] last July [year]."
mydata$s9q74[1853] <- "Hospitalization for [name]"

#'# GPS data: Displace
# !!!No GPS data


#'# Save processed data in Stata and SPSS format

# Write the processed dataset as "<filename>_PU" in both Stata (.dta) and
# SPSS (.sav) formats.
haven::write_dta(mydata, sprintf("%s_PU.dta", filename))
haven::write_sav(mydata, sprintf("%s_PU.sav", filename))

# Report title, built from the section file name
title_var <- sprintf("DOL-ILAB SDC - %s", filename)
#'---
#'  title: `r title_var`
#'---
