#' ---
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#'      theme: united
#' ---

# Start from an empty global environment so objects left over from a previous
# run cannot leak into this section's processing.
# `all.names = TRUE` also lists hidden (dot-prefixed) objects. The earlier
# `all=t` relied on partial matching of `all.names` and passed `t`, which is
# base R's transpose function rather than the logical `TRUE`.
rm(list = ls(all.names = TRUE))

#'# Setup filenames

# Base name of this section; reused below for the output files and the title.
filename <- "Section_4" # !!!Update filename
# Script that is sourced next; `mydata` and the helper functions used below
# are not defined in this file, so presumably they come from it (confirm).
functions_vers <- "functions_1.8.R" # !!!Update helper functions file

#'# Setup data, functions and create dictionary for dataset review
source(functions_vers)

#'
#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location: Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 


#'# Direct PII: variables to be removed
# !!!No Direct PII 


#'# Direct PII-team: Encode field team names
# !!!No Direct PII - team


#'# Small locations: Encode locations  with pop <100,000 using random large numbers
# !!!No Small locations


#'# Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less. 

# Top code high income to the 99.5 percentile

# 99.5th percentile of c_s4q37 (rounded down), computed over responses that
# are neither NA nor the -97 code. The work happens inside local() so the
# intermediate vector does not linger in the global environment.
percentile_99.5 <- local({
  reported <- na.exclude(mydata$c_s4q37)
  floor(quantile(reported[reported != -97], probs = 0.995))
})

# Hand the cut-off to the top_recode() helper (defined outside this file),
# passing -97 as the code it should treat as missing.
mydata <- top_recode(
  variable = "c_s4q37",
  break_point = percentile_99.5,
  missing = -97
)


#'# Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

# Variables flagged as indirect PII for this section, in questionnaire order
# (c_s4q14 is not part of the list).
indirect_PII <- c(
  "c_s4q1", "c_s4q2", "c_s4q3", "c_s4q4", "c_s4q5",
  "c_s4q6", "c_s4q7", "c_s4q8", "c_s4q9", "c_s4q10",
  "c_s4q11", "c_s4q12", "c_s4q13",
  "c_s4q15", "c_s4q16", "c_s4q16extra", "c_s4q17", "c_s4q17extra",
  "c_s4q18", "c_s4q19", "c_s4q20",
  "c_s4q36"
)

# capture_tables() is defined outside this file; presumably it tabulates each
# listed variable for review (confirm against the helper functions file).
capture_tables(indirect_PII)

# Recode those with very specific values. 

# Break points 1..12 for c_s4q16, kept as doubles like the original literals.
break_activity <- as.numeric(1:12)

# Value labels for those twelve codes. Codes 4, 5, 6 and 9 all carry the
# generic label "Other"; the remaining codes keep a specific label.
labels_activity <- setNames(
  break_activity,
  c(
    "Your family dwelling",
    "Family Field",
    "Employer House",
    "Other",
    "Other",
    "Other",
    "Shop, Market, Kiosk",
    "Street",
    "Other",
    "Other (Specify)",
    "Non-Family's Field",
    "Fishing area"
  )
)

# ordinal_recode() is defined outside this file; 999999 is passed as the code
# it should treat as missing.
mydata <- ordinal_recode(
  variable = "c_s4q16",
  break_points = break_activity,
  missing = 999999,
  value_labels = labels_activity
)

#'# Matching and crosstabulations: Run automated PII check 

# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
# !!!Insufficient demographic data

#'# Open-ends: review responses for any sensitive information, redact as necessary
#' 
# !!! Identify open-end variables here: 
# Open-ended (free text) variables to review, in the order they are passed to
# the report helper.
open_ends <- c(
  "c_s4q1noresponse", "c_s4q2noresponse", "c_s4q3noresponse",
  "c_s4q4noresponse", "c_s4q5noresponse", "c_s4q6noresponse",
  "c_s4q7noresponse", "c_s4q8noresponse", "c_s4q9noresponse",
  "c_s4q10noresponse", "c_s4q11noresponse", "c_s4q12noresponse",
  "c_s4q13noresponse", "c_s4q14noresponse",
  "c_s4q15_other", "c_s4q16_other",
  "c_s4q16extranoresponse", "c_s4q17noresponse", "c_s4q17extranoresponse",
  "c_s4q18noresponse", "c_s4q19noresponse", "c_s4q20noresponse",
  "c_s4q27", "c_s4q21noresponse", "c_s4q27_other", "c_s4q28noresponse",
  "c_s4q35", "c_s4q29noresponse", "c_s4q36why", "c_s4q37noresponse"
)

# report_open() is defined outside this file; the review note that follows
# refers to a "verbatims.csv" file, presumably produced by this call (confirm).
report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number 

# Row-level redactions: each response is overwritten with a bracketed
# placeholder in place of the original detail.
mydata$c_s4q14noresponse[99] <- "[language]"

# Rows 728 and 920 of c_s4q15_other, replaced in that order.
mydata$c_s4q15_other[c(728, 920)] <- c(
  "She has no work since [date]",
  "They own [type of store] store"
)

# Collapse the listed c_s4q16_other verbatims to the generic text "Other".
# All 263 row numbers are assigned in one indexed assignment (ten per line,
# ascending) instead of one statement per row.
mydata$c_s4q16_other[c(
  99, 105, 106, 156, 189, 273, 277, 293, 302, 308,
  309, 319, 321, 322, 330, 335, 345, 348, 394, 450,
  452, 475, 504, 505, 507, 515, 519, 520, 523, 524,
  527, 528, 537, 538, 546, 547, 563, 566, 574, 575,
  578, 579, 592, 595, 598, 599, 600, 601, 602, 605,
  632, 635, 639, 641, 642, 652, 669, 675, 678, 681,
  692, 725, 726, 728, 730, 738, 754, 760, 761, 769,
  772, 788, 791, 795, 798, 800, 802, 804, 805, 806,
  813, 815, 821, 831, 832, 837, 838, 840, 854, 861,
  865, 867, 868, 877, 878, 879, 889, 925, 933, 936,
  939, 942, 943, 963, 968, 969, 972, 973, 982, 984,
  1004, 1006, 1008, 1041, 1065, 1066, 1075, 1084, 1091, 1094,
  1097, 1098, 1100, 1106, 1128, 1134, 1150, 1151, 1172, 1189,
  1201, 1214, 1216, 1227, 1228, 1237, 1239, 1245, 1246, 1271,
  1274, 1282, 1299, 1312, 1378, 1379, 1380, 1404, 1411, 1436,
  1446, 1458, 1466, 1485, 1497, 1505, 1507, 1508, 1530, 1531,
  1541, 1543, 1556, 1567, 1571, 1574, 1577, 1585, 1617, 1618,
  1648, 1666, 1668, 1670, 1723, 1725, 1756, 1757, 1773, 1779,
  1788, 1789, 1792, 1806, 1809, 1828, 1829, 1830, 1836, 1839,
  1842, 1843, 1845, 1848, 1852, 1855, 1857, 1859, 1866, 1867,
  1872, 1881, 1882, 1892, 1896, 1908, 1911, 1913, 1917, 1918,
  1919, 1922, 1939, 1956, 1960, 1961, 1964, 1974, 1976, 1978,
  1979, 1981, 1984, 1986, 1990, 1993, 1995, 1996, 1997, 2004,
  2006, 2007, 2015, 2023, 2033, 2040, 2041, 2042, 2100, 2102,
  2103, 2104, 2108, 2111, 2112, 2113, 2115, 2132, 2133, 2143,
  2144, 2150, 2168, 2169, 2223, 2259, 2260, 2277, 2283, 2298,
  2299, 2300, 2309
)] <- "Other"

# Remaining row-level redactions, one indexed assignment per column.
# NOTE(review): the "[language]" placeholder is written to row 100 in the
# *noresponse columns below but to row 99 in c_s4q27 and c_s4q35; confirm
# against verbatims.csv that both row numbers are intended.
mydata$c_s4q17noresponse[100] <- "[language]"
mydata$c_s4q19noresponse[100] <- "[language]"
mydata$c_s4q20noresponse[100] <- "[language]"

mydata$c_s4q27[c(
  99, 118, 378, 526, 564,
  579, 1120, 1425, 1455, 1680
)] <- "[language]"

mydata$c_s4q29noresponse[c(100, 874)] <- "[language]"

# Row 99 gets the language placeholder; row 2068 keeps its sentence with the
# amount replaced by a bracketed placeholder.
mydata$c_s4q35[c(99, 2068)] <- c(
  "[language]",
  "She once cleaned the house of her grandmother for [amount of money] over the past year."
)


#'# GPS data: Displace
# !!!No GPS data

#'# Save processed data in Stata and SPSS format

# Write the processed data next to the script as "<filename>_PU.dta" (Stata)
# and "<filename>_PU.sav" (SPSS).
haven::write_dta(
  data = mydata,
  path = paste0(filename, "_PU.dta")
)
haven::write_sav(
  data = mydata,
  path = paste0(filename, "_PU.sav")
)

# Report title; referenced by the inline `r title_var` in the YAML block that
# follows.
title_var <- paste0("DOL-ILAB SDC - ", filename)
#'---
#'  title: `r title_var`
#'---
