#' ---
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#'      theme: united
#' ---

# Start from a clean workspace, including hidden (dot-prefixed) objects.
# The previous call passed `all=t`: lowercase `t` is base R's transpose
# function, not the logical TRUE, and `all` relied on partial matching of the
# `all.names` argument. Both are spelled out explicitly here.
rm(list = ls(all.names = TRUE))

#' #Setup filenames

# !!!Update filename
# Base name of the dataset; reused further down to name the exported files
# and to build the report title.
filename <- "InDepthStudents2016_Rural_Raw_NOPII"

# !!!Update helper functions file
functions_vers <- "functions_1.7.R"

#' #Setup data, functions and create dictionary for dataset review
# NOTE(review): the sourced file presumably loads `mydata` and defines the
# helper functions called below -- confirm against the functions file.
source(file = functions_vers)

#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location: Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 


#!!!Save flagged dictionary in .csv format, add "DatasetReview" to name and continue processing data with subset of flagged variables

#' #Direct PII: variables to be removed

# !!!Include any Direct PII variables
# Columns removed outright from `mydata`. Names listed here that do not exist
# in the dataset have no effect, because the filter only tests membership.
dropvars <- c(
  "nomest", "apest", "amest",
  "nompad", "app_pad", "nommad", "app_mad",
  "p1a1_fixed",
  "p1a2", "p1a2_fixed",
  "p1a3", "p1a3_fixed",
  "p1a4", "p1a4_fixed",
  "address", "referencia",
  "audio1_student", "audio2_student", "audio3_student",
  "text_audit",
  "cto_padre", "cto_padre_nom", "cto_padre_app1", "cto_padre_app2",
  "audio_random",
  "key"
)
mydata <- mydata[!(names(mydata) %in% dropvars)]

#' #Direct PII-team: Encode field team names
#  Interviewer names, for example, may be useful for the analysis of interviewer effects
#' !!!Replace vector in "variables" field below with relevant variable names

# Column "i5" is dropped from the dataset here; no encoding is applied to it.
mydata <- mydata[names(mydata) != "i5"]

#' #Small locations: Encode locations with pop <100,000 using random large numbers
#' !!!Include relevant variables, but check their population size first to confirm they are <100,000

# Location variables handed to encode_location(); 999999 is passed as the
# `missing` argument.
locvars <- c(
  "i8a", "i7", "i9a1", "cod_mod",
  "school_fixed_primary", "school_fixed_sec",
  "nom_dist", "district_fixed",
  "p12",
  "school2014_name1", "school2013_name1"
)
mydata <- encode_location(variables = locvars, missing = 999999)



# !!!Remove, as these variables contain identifying information

# Location-related columns removed rather than encoded.
dropvars <- c(
  "i9a",
  "school2016", "school_fixed",
  "p11b",
  "school2014_name", "school2013_name",
  "district_fixed1",
  "centro_poblado"
)
mydata <- mydata[!(names(mydata) %in% dropvars)]



#' #Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less. 

# !!!Remove, as these variables contain identifying information

# Columns removed from the dataset in this section.
dropvars <- c("i15", "i16", "i16a", "i18_fixed")
mydata <- mydata[!(names(mydata) %in% dropvars)]


# Top code days absent from school (5 or more)
#
# The result is assigned back to `mydata`. It was previously stored in
# `mydata2`, which is never read again, so the top-coded values never reached
# the exported files.
# NOTE(review): assumes top_recode() returns the full updated dataset, as the
# other helpers in this script (encode_location, ordinal_recode) are used --
# confirm against the helper functions file.
# 888 and 999999 are passed as the `missing` codes.
mydata <- top_recode("p12b", break_point = 5, missing = c(888, 999999))



#' #Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

# Variables handed to capture_tables(). Regular name families are generated
# with paste0(); the resulting vector has the same 200 names in the same order
# as a fully spelled-out list.
indirect_PII <- c(
  "i10", "i12", "i23",
  "dropout", "school_fixed_level",
  "do_grade2015_fixed", "do_approved_grade2015", "do_2015_fixed",
  "dropout_approved_fixed",
  "p1a", "p1a_1", "p1a_2", "p1a_3",
  # dout_reasons, then its _2 ... _14, _98 and _99 suffixed columns
  # (dout_reasons_1 is deliberately absent from this list)
  "dout_reasons", paste0("dout_reasons_", c(2:14, 98, 99)),
  "dout_decision", "p20",
  "same_school2015", "switcher_2016",
  "asissted_2014", "same_school2014", "switcher_2015",
  # dout2014 / dout2013, each with _1 ... _14, _98 and _99 suffixed columns
  "dout2014", paste0("dout2014_", c(1:14, 98, 99)),
  "asissted_2013", "same_school2013", "switcher_2014",
  "dout2013", paste0("dout2013_", c(1:14, 98, 99)),
  # a2 ... a11 and m2 ... m11, each followed by its b, c and d variants
  paste0("a", rep(2:11, each = 4), c("", "b", "c", "d")),
  paste0("m", rep(2:11, each = 4), c("", "b", "c", "d")),
  "p22a", "p22b",
  "p25_note", "p25a1", "p25a2", "p25a3",
  "p25b", "p25c", "p25d", "p25e",
  "p25_1_note1",
  "p25_1a", "p25_1b", "p25_1c", "p25_1d", "p25_1e", "p25_1f",
  "p25_2g", "p25_3h", "p25_4i", "p25_5j", "p25_6k", "p25_7l", "p25_8m",
  "p25_9n", "p25_10o", "p25_11p", "p25_12q", "p25_13r",
  "p25_14s", "p25_14t",
  "p25_2_note",
  "p25_2a", "p25_2b", "p25_2c", "p25_2d", "p25_2e", "p25_2f",
  "p25_2g1", "p25_2h", "p25_2i",
  "p27_note", "p27a", "p27b", "p27c", "p27d", "p27e"
)

capture_tables(indirect_PII)


# Recode those with very specific values. 

# Fold the two combined answers "1 3" and "2 3" into the single value "Otros".
mydata$p1a[mydata$p1a == "1 3" | mydata$p1a == "2 3"] <- "Otros"


# Break points shared by the three switcher_* recodes below.
break_activity <- c(-98, 1, 2, 3, 4, 99)

# Value labels for switcher_2016: label text paired with the codes 1 to 6.
labels_activity <- setNames(
  c(1, 2, 3, 4, 5, 6),
  c(
    "No se",
    "Porque mi escuela anterior no tenia nivel secundario",
    "Otro",
    "Otro",
    "Porque mi nueva escuela es mejor que mi antigua escuela",
    "Otro"
  )
)
mydata <- ordinal_recode(
  variable = "switcher_2016",
  break_points = break_activity,
  missing = 999999,
  value_labels = labels_activity
)

# Value labels for switcher_2015 (same label set as switcher_2016).
labels_activity <- setNames(
  c(1, 2, 3, 4, 5, 6),
  c(
    "No se",
    "Porque mi escuela anterior no tenia nivel secundario",
    "Otro",
    "Otro",
    "Porque mi nueva escuela es mejor que mi antigua escuela",
    "Otro"
  )
)
mydata <- ordinal_recode(
  variable = "switcher_2015",
  break_points = break_activity,
  missing = 999999,
  value_labels = labels_activity
)

# Value labels for switcher_2014: code 3 keeps its own label instead of "Otro".
labels_activity <- setNames(
  c(1, 2, 3, 4, 5, 6),
  c(
    "No se",
    "Porque mi escuela anterior no tenia nivel secundario",
    "Porque mi nueva escuela esta mas cerca de mi casa",
    "Otro",
    "Porque mi nueva escuela es mejor que mi antigua escuela",
    "Otro"
  )
)
mydata <- ordinal_recode(
  variable = "switcher_2014",
  break_points = break_activity,
  missing = 999999,
  value_labels = labels_activity
)


#' #Matching and crosstabulations: Run automated PII check 
# selected categorical key variables: gender, occupation/education and age
##!!! Replace with candidate categorical demo vars
selectedKeyVars <- c("i17", "grado", "i12")

# Build the sdcMicro object on the chosen key variables and display it
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial


#' Show values of key variable of records that violate k-anonymity
mydata <- labelDataset(mydata)
# Flag rows whose value in column 2 of the individual risk matrix is below 2
# (the check for 2-anonymity)
notAnon <- sdcInitial@risk$individual[, 2] < 2
mydata[notAnon, selectedKeyVars]
sdcFinal <- localSuppression(sdcInitial)

# Recombining anonymized variables
extractManipData(sdcFinal)[notAnon, selectedKeyVars] # manipulated variables HH
# Blank out i12 and grado for every flagged row of the working dataset
mydata[notAnon, c("i12", "grado")] <- NA

#' #Open-ends: review responses for any sensitive information, redact as necessary
#' 
# !!! Identify open-end variables here: 
# Open-ended variables passed to report_open(). The a2..a11 and m2..m11
# families each contribute an "_o" and a "g" column per question number; the
# vector holds the same 54 names in the same order as a spelled-out list.
open_ends <- c(
  "dout_reasons_1", "v108", "p13c1",
  "p35a", "p35a1", "p35b", "p35b1",
  "switcher_2016_otro", "switcher_2015_otro",
  "dout2014_otro",
  "switcher_2014_otro",
  "dout2013_otro",
  paste0("a", rep(2:11, each = 2), c("_o", "g")),
  paste0("m", rep(2:11, each = 2), c("_o", "g")),
  "q48",
  "p1a"
)

report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number 
# !!! Remove, as they contain a lot of sensitive information and they are in Spanish.

# Drop all removed open-ended columns in one pass instead of one column per
# statement. The set is every open-end variable except "p1a", which stays in
# the dataset.
mydata <- mydata[!(names(mydata) %in% c(
  "dout_reasons_1", "v108", "p13c1",
  "p35a", "p35a1", "p35b", "p35b1",
  "switcher_2016_otro", "switcher_2015_otro",
  "dout2014_otro",
  "switcher_2014_otro",
  "dout2013_otro",
  # a2_o, a2g, ..., a11_o, a11g and the matching m-series
  paste0("a", rep(2:11, each = 2), c("_o", "g")),
  paste0("m", rep(2:11, each = 2), c("_o", "g")),
  "q48"
))]


#' #GPS data: Displace
#' # !!! No GPS data


#' #Save processed data in Stata and SPSS format
#' Adds "_PU" (Public Use) to the end of the name 

# Export the processed dataset next to the script: Stata first, then SPSS.
haven::write_dta(data = mydata, path = sprintf("%s_PU.dta", filename))
haven::write_sav(data = mydata, path = sprintf("%s_PU.sav", filename))

# Add report title dynamically
# Consumed by the inline `r title_var` expression in the YAML lines that follow.
title_var <- sprintf("DOL-ILAB SDC - %s", filename)
#'---
#'  title: `r title_var`
#'---
