#' ---
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#'      theme: united
#' ---

# Start from an empty workspace so objects left over from a previous section
# cannot leak into this run. all.names = TRUE also removes dot-prefixed
# (hidden) objects. The flag is spelled out in full: the previous `all=t`
# passed the symbol `t` (base R's transpose function unless rebound), not TRUE.
rm(list = ls(all.names = TRUE))

#'# Setup filenames

# Stem of the output files written at the end of this script.
filename <- "Section_4" # !!!Update filename
# Helper script sourced below.
# NOTE(review): mydata is never created in this file, so it presumably comes
# from this helper script - confirm.
functions_vers <- "functions_1.8.R" # !!!Update helper functions file

#'# Setup data, functions and create dictionary for dataset review
source(functions_vers)
#'
#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location with population <100,000
# Large Location: Location with population >100,000
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 


#'# Direct PII: variables to be removed

# !!!Include any Direct PII variables
dropvars <- c("eh_s4q3")
# Flag the columns named in dropvars, then keep only the unflagged ones.
is_direct_pii <- names(mydata) %in% dropvars
mydata <- mydata[!is_direct_pii]

#'# Direct PII-team: Encode field team names
# !!!No Direct PII - team


#'# Small locations: Encode locations  with pop <100,000 using random large numbers
# !!!No Small locations


#'# Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less. 

# Recode education attainment of adults to reduce risk of re-identification 

# Inspect both education variables before recoding (haven_table() is defined
# in the sourced helper file).
haven_table("eh_s4q30")
haven_table("eh_s4q41")

# Break points for eh_s4q30. Codes absent from the sequence (11, 12, 14, 20,
# 22-24) are merged into a neighbouring group, as the "or" labels spell out.
# NOTE(review): assumes ordinal_recode() treats break_points as the lower
# bounds of each group - confirm in the helper file.
break_edu <- c(-998, 1:10, 13, 15:19, 21, 25, 96)

# One label per break point, numbered 1-20 in break-point order.
labels_edu <- structure(
  as.numeric(1:20),
  names = c(
    "-998", as.character(1:9),
    "10 or 11 or 12", "13 or 14",
    as.character(15:18),
    "19 or 20", "21 or 22 or 24",
    "25", "96"
  )
)

mydata <- ordinal_recode(
  variable = "eh_s4q30",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)

# Break points for eh_s4q41. Unlike eh_s4q30, codes 9-12 are merged into one
# group while 13 and 14 stay separate.
# NOTE(review): assumes ordinal_recode() treats break_points as the lower
# bounds of each group - confirm in the helper file.
break_edu <- c(-998, 1:9, 13:19, 21, 25, 96)

# One label per break point, numbered 1-20 in break-point order.
labels_edu <- structure(
  as.numeric(1:20),
  names = c(
    "-998", as.character(1:8),
    "9 or 10 or 11 or 12",
    as.character(13:18),
    "19 or 20", "21 or 24",
    "25", "96"
  )
)

mydata <- ordinal_recode(
  variable = "eh_s4q41",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)


#'# Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

# Variables handed to capture_tables() (defined in the sourced helper file).
indirect_PII <- c("eh_s4q4", "eh_s4q32", "eh_s4q33", "eh_s4q43")

capture_tables(indirect_PII)

# Recode those with very specific values.

# Break points for the occupation codes in eh_s4q33 (29 in total).
break_ocup <- c(
  -998, 1:10, 12, 14, 22, 33:35, 39, 44, 62, 64, 65, 67, 68, 73, 75:77, 99
)

# One label per break point, numbered 1-29 in break-point order. Several
# break points share the same broad occupation label, so the repeated texts
# are defined once inside local() to keep them out of the workspace.
labels_ocup <- local({
  street <- "Street and related sales and service workers"
  cleaners <- "Cleaners and helpers"
  craft <- "Food processing, wood working, garment and other craft and related trades workers"
  agri <- "Agricultural, forestry and fishery labourers"

  structure(
    as.numeric(1:29),
    names = c(
      "Don't know",
      rep(street, 6),
      "Personal care workers",
      rep(cleaners, 3),
      rep(craft, 2),
      rep(agri, 5),
      "Food preparation assistants",
      "Refuse workers and other elementary workers",
      street,
      "Customer services clerks",
      "Personal service workers",
      "Electrical and electronic trades workers",
      craft,
      "Student",
      cleaners,
      street,
      "Other: Specify "
    )
  )
})

mydata <- ordinal_recode(
  variable = "eh_s4q33",
  break_points = break_ocup,
  missing = 999999,
  value_labels = labels_ocup
)




#'# Matching and crosstabulations: Run automated PII check 
# !!!Insufficient demographic data

#'# Open-ends: review responses for any sensitive information, redact as necessary
#' 
# !!! Identify open-end variables here: 
# Open-ended variables passed to report_open() (defined in the sourced helper
# file).
# NOTE(review): report_open() presumably writes the "verbatims.csv" reviewed
# in the next step - confirm.
open_ends <- c(
  "eh_s4q6", "eh_s4q10", "eh_s4q12", "eh_s4q14",
  "eh_s4q22", "eh_s4q24", "eh_s4q31", "eh_s4q34",
  "eh_s4q37", "eh_s4q42", "eh_s4q45"
)

report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number 

# Redactions for the open-ended variable eh_s4q34. Each entry maps the
# replacement text to the row positions in mydata whose response it
# overwrites. Rows are addressed by position, so mydata must still be in the
# row order that the row numbers were taken from.
# The catch-all category is written as "Other" throughout: some rows were
# previously given a lowercase "other", which would have appeared as a
# separate category in the released data.
redactions_q34 <- list(
  "Cleaners and helpers" = c(
    35, 72, 833, 834, 1392, 1422, 1487, 1488, 1677, 1713, 1716, 2637, 2644,
    2697, 2724, 2725, 2834:2836, 2910:2916, 3151, 3152, 3164, 3165, 5050,
    5051, 5070:5072
  ),
  "Personal care workers" = c(
    114, 115, 1203, 1204, 1252, 1264:1266, 1416, 1417, 1527, 1528, 2072:2075,
    3098, 4137, 4786, 5179:5181, 5723
  ),
  "Personal service workers" = c(133:136, 5162, 5163),
  "Protective services workers" = c(157, 158),
  "Street and related sales and service workers" = c(361, 553, 3582),
  "Business and administration associate professionals" = c(601, 2592),
  "Other" = c(
    883, 884, 1361, 1804, 1805, 1839:1842, 1951, 1952, 2019, 2102, 2103,
    2112, 2581, 2889, 2957, 3085, 3086, 3195, 3228, 3578, 3766, 3767, 3769,
    4139, 4141, 4916, 5060, 5244
  ),
  "Food processing, wood working, garment and other craft and related trades workers" = c(
    1047, 1049, 5250
  ),
  "Sales workers" = c(1702),
  "Teaching professionals" = c(1867, 5278),
  "[language]" = c(1909, 1953, 2174),
  "Labourers in mining, construction, manufacturing and transport" = c(
    2239, 2878, 2994, 3050, 4595, 5135
  ),
  "Electrical and electronic trades workers" = c(2380, 3220:3222, 5743),
  "Customer services clerks" = c(2907)
)

# Validate the whole table before touching the data, so a bad row number
# cannot leave the column partially redacted.
redacted_rows <- unlist(redactions_q34, use.names = FALSE)
if (anyDuplicated(redacted_rows) > 0L) {
  stop("eh_s4q34 redactions list the same row more than once.", call. = FALSE)
}
if (any(redacted_rows > nrow(mydata))) {
  stop(
    "eh_s4q34 redactions reference rows beyond the end of mydata; ",
    "re-check the row numbers against verbatims.csv.",
    call. = FALSE
  )
}

# Overwrite every listed row with its replacement text.
for (replacement in names(redactions_q34)) {
  mydata$eh_s4q34[redactions_q34[[replacement]]] <- replacement
}

#'# GPS data: Displace
# !!!No GPS data


#'# Save processed data in Stata and SPSS format

# Both output files share the stem "<filename>_PU" and differ only in their
# extension.
output_stem <- paste0(filename, "_PU")
haven::write_dta(mydata, paste0(output_stem, ".dta"))
haven::write_sav(mydata, paste0(output_stem, ".sav"))

# Report title, inserted into the YAML header that follows.
title_var <- sprintf("DOL-ILAB SDC - %s", filename)
#'---
#'  title: `r title_var`
#'---
