#' ---
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#'      theme: united
#' ---

# Start from an empty workspace so objects left over from an earlier session
# cannot leak into this run. `all.names = TRUE` also removes dot-prefixed
# (hidden) objects. The flag must be the literal TRUE: a bare `t` resolves to
# the base transpose function, which is not a logical value.
rm(list = ls(all.names = TRUE))

#'# Setup filenames

filename <- "Section_10"  # !!!Update filename
functions_vers <- "functions_1.8.R"  # !!!Update helper functions file

#'# Setup data, functions and create dictionary for dataset review
# Run the helper script named above in this session.
# NOTE(review): `mydata`, `top_recode()` and `report_open()` are used further
# down but not defined in this file; presumably this script provides them.
source(file = functions_vers)

#'
#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location with population <100,000
# Large Location: Location with population >100,000
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 


#'# Direct PII: variables to be removed
# !!!No Direct PII

#'# Direct PII-team: Encode field team names
# !!!No Direct PII-team

#'# Small locations: Encode locations  with pop <100,000 using random large numbers
# !!!No Small Locations

#'# Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less. 

# Top-code the ordinal indirect-PII variables of this section.
#
# For each variable the cut-off passed to top_recode() is the floored 99.5th
# percentile of its observed values, where "observed" excludes NA and the
# 999999 missing-value code. Variables listed in `fixed_break_points` use the
# hand-picked cut-off instead. Variables are processed in the order given by
# `top_code_vars`, so anything top_recode() emits keeps its sequence.
missing_code <- 999999

# Variables whose cut-off is set by hand rather than derived from the data.
fixed_break_points <- c(eh_s10q50 = 20)

# eh_s10q2, q4, ..., q30 (even-numbered items), then q31-q51 and q54-q60.
top_code_vars <- paste0("eh_s10q", c(seq(2, 30, by = 2), 31:51, 54:60))

for (var_name in top_code_vars) {
  if (var_name %in% names(fixed_break_points)) {
    break_point <- fixed_break_points[[var_name]]
  } else {
    # Drop NA once, then drop the missing-value code before taking the
    # percentile so neither distorts the cut-off.
    observed <- na.exclude(mydata[[var_name]])
    # The object name is kept as-is so the same global is left behind.
    percentile_99.5 <- floor(
      quantile(observed[observed != missing_code], probs = c(0.995))
    )
    break_point <- percentile_99.5
  }
  # top_recode() returns the updated data set, which replaces `mydata`.
  mydata <- top_recode(
    variable = var_name, break_point = break_point, missing = missing_code
  )
}

#'# Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values
# !!!No Indirect PII - Categorical

#'# Matching and crosstabulations: Run automated PII check 
# !!!Insufficient demographic data

#'# Open-ends: review responses for any sensitive information, redact as necessary
#' 
# !!! Identify open-end variables here: 
# Free-text columns that need a manual review for sensitive content.
open_ends <- c("eh_s10q29", "eh_s10q53")

# NOTE(review): report_open() is defined outside this file; presumably it
# produces the "verbatims.csv" listing that is reviewed next - confirm.
report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number 

# Rows of eh_s10q29 whose verbatim is replaced entirely by the "[language]"
# placeholder. Row numbers are positions in `mydata` as loaded, so they are
# only valid while the data keep that row order.
q29_language_rows <- c(
  157, 652, 755, 795, 828, 829, 844, 862, 879, 884, 891, 920, 923, 924, 927,
  950, 966, 972, 973, 974, 978, 1009, 1209, 1228, 1389, 1493, 1582, 1628,
  1651, 1685, 1735, 1845, 1866, 1876, 1919, 1921, 1945, 1958, 1962, 1979,
  2000, 2011, 2050, 2052, 2053, 2074, 2119, 2160, 2169, 2235, 2241, 2246,
  2268, 2282
)
mydata$eh_s10q29[q29_language_rows] <- "[language]"

# The one row that keeps its wording, with the identifying parts masked.
mydata$eh_s10q29[1616] <- "Food for [name]'s [event]"

# Redactions for eh_s10q53 as a lookup: each name is a row position in
# `mydata` (valid only while the data keep their loaded row order) and each
# value is the text written to that row. Values are stored exactly as given,
# including their spelling and any leading or trailing spaces.
q53_redactions <- c(
  "5" = "[language]",
  "17" = "materials",
  "19" = "licensed ([store])",
  "28" = "[amount redacted]",
  "48" = "Tuition fee of [name]",
  "76" = "[amount redacted]",
  "101" = "materials ",
  "113" = "[language]",
  "130" = "[repaires]",
  "144" = "[repaires]",
  "164" = "[event]",
  "176" = "[language]",
  "185" = "[training] - [name]",
  "189" = "[repaires]",
  "201" = "[celebration]",
  "207" = "[repaires]",
  "244" = "[repaires]",
  "246" = "[repaires]",
  "247" = "[repaires]",
  "272" = "Medicine for [person]",
  "278" = "Medical expenses of [person]",
  "307" = "[language]",
  "334" = "[repaires]",
  "403" = "Medical expenses for the [persons]",
  "430" = "[event]",
  "450" = "[amount redacted]",
  "487" = "Gown rent for [event] of her son",
  "511" = "Medical expenses for the [person]",
  "544" = "[language]",
  "547" = "[amount redacted]",
  "549" = "[work]",
  "556" = "[amount redacted]",
  "566" = "[other]",
  "689" = "[event]",
  "767" = "[language]",
  "794" = "Business permit and [repaires]",
  "828" = "Medical expenses for [person]",
  "848" = "[language]",
  "866" = "[language]",
  "889" = "[language]",
  "896" = "[language]",
  "897" = "Travel to [place] and [event] of [name and date]",
  "931" = "[language]",
  "932" = "[language]",
  "949" = "Materials ",
  "954" = "[language]",
  "967" = "[language]",
  "1029" = "[language]",
  "1031" = "[language]",
  "1046" = "[language]",
  "1062" = "Travel expenses to relatuve in [place]",
  "1072" = "[repaires] and school materials",
  "1077" = "For applying work in [place](daughter [name])",
  "1088" = "[situation]",
  "1106" = "[person] hospitalized",
  "1115" = "Field trip - [amount redacted]",
  "1194" = "School project of [name]",
  "1196" = "Medical expenses for [people]",
  "1264" = "Fare from ([places])",
  "1277" = " (Work Requirements)",
  "1282" = "[medical expenses]",
  "1388" = "[amount redacted]",
  "1407" = "[repaires]",
  "1462" = "Medical Expense on [date]",
  "1512" = "[amount redacted]",
  "1519" = "[amount redacted]",
  "1540" = "[language]",
  "1581" = "Transportation from [place] going to [place] to [purpose]",
  "1587" = "[repaires]",
  "1627" = "graduation",
  "1651" = "[materials]",
  "1666" = "[amount redacted]",
  "1694" = "Graduation",
  "1713" = "[materials]",
  "1727" = "[language]",
  "1737" = "Helping with the burial of [person]",
  "1738" = "[language]",
  "1754" = "[repaires]",
  "1757" = "[language]",
  "1761" = "[event]",
  "1785" = "[repaires]",
  "1808" = "Enrolment fee [amount redacted] scholar [amount redacted] per sem",
  "1811" = "[repaires]",
  "1821" = "[amount redacted]",
  "1843" = "[repaires]",
  "1856" = "For medication of [person] and graduation ",
  "1897" = "[illness]",
  "1908" = "[amount redacted]",
  "1916" = "[amount redacted]",
  "1919" = "[language]",
  "1924" = "Hospitalization of [name]",
  "1937" = "[amount redacted]",
  "2007" = "License",
  "2023" = "[name]'s birthday",
  "2074" = "Birthday and anniversary [date]",
  "2078" = "[language]",
  "2126" = "[amount redacted]",
  "2160" = "[language]",
  "2181" = "[language]",
  "2190" = "[repaires]",
  "2200" = "[language]",
  "2209" = "[repaires] and graduation",
  "2256" = "[language]",
  "2280" = "[language]",
  "2281" = "Hospitalization . [date]"
)

# Apply every redaction in one assignment; names become the row positions.
mydata$eh_s10q53[as.integer(names(q53_redactions))] <- unname(q53_redactions)



#'# GPS data: Displace
# !!!No GPS data

#'# Save processed data in Stata and SPSS format

# Both public-use copies share the stem "<filename>_PU"; only the extension
# differs between the Stata (.dta) and SPSS (.sav) files.
public_use_stem <- paste0(filename, "_PU")
haven::write_dta(mydata, paste0(public_use_stem, ".dta"))
haven::write_sav(mydata, paste0(public_use_stem, ".sav"))

# Add report title dynamically
title_var <- sprintf("DOL-ILAB SDC - %s", filename)
#'---
#'  title: `r title_var`
#'---
