#' ---
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#'      theme: united
#' ---

# Start from an empty workspace. `all.names = TRUE` is written out in full:
# lower-case `t` is base R's transpose function, not the logical TRUE that
# ls() needs in order to also list objects whose names start with a dot.
rm(list = ls(all.names = TRUE))

#'# Setup filenames

# Base name of the dataset under review; it is reused further down to name the
# "_PU" output files and to build the report title.
filename <- "Section_9" # !!!Update filename
# Path of the helper script loaded below.
functions_vers <- "functions_1.8.R" # !!!Update helper functions file

#'# Setup data, functions and create dictionary for dataset review
# NOTE(review): mydata, top_recode() and capture_tables() are not defined in
# this script, so they presumably come from the helper file - confirm.
source(file = functions_vers)

#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location: Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 


#'# Direct PII: variables to be removed
# !!!No Direct PII

#'# Direct PII-team: Encode field team names
# !!!No Direct PII-team

#'# Small locations: Encode locations  with pop <100,000 using random large numbers
# !!!No Small Locations

#'# Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less. 

# Top-code the ordinal indirect-PII items, one after another in questionnaire
# order. Each item is capped at the floored 99.5th percentile of its valid
# responses (NA and the missing code are left out of the percentile), except
# for the items listed in `sdc_fixed_breaks`, which use a hand-picked break
# point instead.
#
# The items are driven from a single list so that the column used for the
# percentile and the column passed to top_recode() can never drift apart.

# Code that marks a missing response in these items.
sdc_missing_code <- 999999

# Items whose break point is fixed rather than derived from the data.
sdc_fixed_breaks <- c(eh_s9q37 = 1)

# Question numbers to top-code (1, 3-6, 11, 27 and 49 are not part of this
# step).
sdc_top_code_vars <- paste0("eh_s9q", c(2, 7:10, 12:26, 28:48, 50:77))

for (sdc_var in sdc_top_code_vars) {
  if (sdc_var %in% names(sdc_fixed_breaks)) {
    sdc_break_point <- sdc_fixed_breaks[[sdc_var]]
  } else {
    sdc_valid <- na.exclude(mydata[[sdc_var]])
    percentile_99.5 <- floor(
      quantile(sdc_valid[sdc_valid != sdc_missing_code], probs = c(0.995))
    )
    sdc_break_point <- percentile_99.5
  }

  # top_recode() comes from the sourced helper file; it is given the column
  # name, and whatever it returns replaces `mydata`.
  mydata <- top_recode(
    variable = sdc_var,
    break_point = sdc_break_point,
    missing = sdc_missing_code
  )
}

#'# Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

# Categorical indirect-PII items to tabulate: questions 1, 4, 5 and 6 of this
# section.
indirect_PII <- paste0("eh_s9q", c(1, 4, 5, 6))

# capture_tables() is provided by the sourced helper file.
capture_tables(indirect_PII)


#'# Matching and crosstabulations: Run automated PII check 
# !!!Insufficient demographic data

#'# Open-ends: review responses for any sensitive information, redact as necessary
#' 
# !!!No Open-ends


#'# GPS data: Displace
# !!!No GPS data

#'# Save processed data in Stata and SPSS format

# Output paths share the dataset base name plus a "_PU" suffix.
stata_path <- paste0(filename, "_PU.dta")
spss_path <- paste0(filename, "_PU.sav")

# Write the processed data set in Stata format first, then in SPSS format.
haven::write_dta(mydata, stata_path)
haven::write_sav(mydata, spss_path)

# Build the report title from the dataset name; the inline `r title_var`
# reference in the YAML block that follows picks it up.
title_var <- sprintf("DOL-ILAB SDC - %s", filename)
#'---
#'  title: `r title_var`
#'---
