#' ---
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#' theme: united
#' ---

# Start from a clean workspace, including dot-prefixed (hidden) objects.
# The flag must be the logical TRUE: the original passed `t`, which is base
# R's transpose function rather than a logical value, and abbreviated the
# `all.names` argument.
rm(list = ls(all.names = TRUE))

#'# Setup filenames

# !!!Update filename
# Used further down as the stem of the "_PU" output files and in the report
# title.
filename <- "IDT_raw_students_NOPII"

# !!!Update helper functions file
functions_vers <- "functions_1.7.R"

#'# Setup data, functions and create dictionary for dataset review
# Run the helper script named above. NOTE(review): `mydata` and the helper
# functions used below are not defined in this file, so they presumably come
# from this script -- confirm.
source(functions_vers)



#'
#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location with population <100,000
# Large Location: Location with population >100,000
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 


#'# Direct PII: variables to be removed

# !!!Include any Direct PII variables
# Columns flagged as direct PII. Any of them present in `mydata` are removed;
# names that do not occur in `mydata` are simply ignored by `%in%`.
dropvars <- c(
  "student_name", "student_fullname", "no_guardian_name",
  "consent_signature", "consent_signature_paper",
  "guard_name", "guard_app", "guard_apm", "guard_nn",
  "p4a",
  "audio1_student", "audio2_student", "audio3_student",
  "ss_phone", "ss_gps_whereother", "ss_photo", "random_audio_ss",
  "grado2016_admin_name"
)
is_direct_pii <- names(mydata) %in% dropvars
mydata <- mydata[!is_direct_pii]

#'# Direct PII-team: Encode field team names
#  No Direct PII-team


#'# Small locations: Encode locations  with pop <100,000 using random large numbers
#  !!!Include relevant variables, but check their population size first to confirm they are <100,000

# !!! The variables listed below are removed outright (not encoded), as they contain identifying information

# Location-type columns that are dropped rather than encoded. The
# school<year>_name1 series is generated for 2015 down to 2010, matching the
# order in which the names were previously listed.
locvars <- c(
  "nombre_colegio", "prompt_cole_name", "cole2016_correct", "cole2016_new",
  paste0("school", 2015:2010, "_name1"),
  "ss_gps_where"
)
mydata <- mydata[!(names(mydata) %in% locvars)]

#  !!!Include relevant variables, but check their population size first to confirm they are <100,000

# Location identifiers handed to encode_location(), with 999999 as the
# `missing` argument. "id_grado" used to appear twice in this vector, which
# passed the same column to the helper twice; each variable is now listed
# once.
# NOTE(review): if the second "id_grado" was meant to be a different
# variable, add it here.
locvars <- c("cod_mod2016_admin", "cole2016_admin", "cole2016", "id_grado")
mydata <- encode_location(variables = locvars, missing = 999999)


#'# Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less.

# Ordinal quasi-identifiers that are removed instead of being recoded.
mydata <- mydata[!(names(mydata) %in% c("birth_date_correct", "i15"))]

# p7a1: break point is the floored 99.5th percentile of the values that are
# neither NA nor the -97 code; it is passed to top_recode() as `break_point`.
p7a1_valid <- mydata$p7a1[!is.na(mydata$p7a1) & mydata$p7a1 != -97]
percentile_99.5 <- floor(quantile(p7a1_valid, probs = 0.995))
mydata <- top_recode(
  variable = "p7a1",
  break_point = percentile_99.5,
  missing = -97
)

# p7b1: same procedure, with the break point recomputed on this column.
p7b1_valid <- mydata$p7b1[!is.na(mydata$p7b1) & mydata$p7b1 != -97]
percentile_99.5 <- floor(quantile(p7b1_valid, probs = 0.995))
mydata <- top_recode(
  variable = "p7b1",
  break_point = percentile_99.5,
  missing = -97
)


#'# Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

# Variables flagged as indirect PII (categorical), passed to capture_tables()
# in exactly this order. Regular name series are generated instead of being
# typed out; irregular ones are listed literally.
indirect_PII <- local({
  # Expands each id into <prefix><id>, ...c, ...d, ...e, ..._sel (in that
  # order), e.g. "a2", "a2c", "a2d", "a2e", "a2_sel".
  item_fields <- function(prefix, ids) {
    unlist(lapply(ids, function(id) {
      paste0(prefix, id, c("", "c", "d", "e", "_sel"))
    }))
  }

  c(
    "attending_confirm",
    "dropout_why", paste0("dropout_why_", 1:14),
    "dropout_confirm",
    "highered_applied", "highered_confirm",
    "grado2016_confirm", "grado2016_correct", "grado2016",
    "nivel2016",
    "p14", "pc_15", "pc_16",
    "dpout_month",
    "dout_reasons", paste0("dout_reasons_", c(1:13, 99)),
    "p16d",
    item_fields("a", 2:10),
    # a11 is the one item with an extra "_o" field right after the stem.
    "a11", "a11_o", "a11c", "a11d", "a11e", "a11_sel",
    item_fields("m", 2:11),
    "p22a", "p22b",
    "p25_note", "p25a1", "p25a2", "p25a3", "p25b", "p25c", "p25d", "p25e",
    "p25_1_note1",
    "p25_1a", "p25_1b", "p25_1c", "p25_1d", "p25_1e", "p25_1f",
    "p25_2g", "p25_3h", "p25_4i", "p25_5j", "p25_6k", "p25_7l", "p25_8m",
    "p25_9n", "p25_10o", "p25_11p", "p25_12q", "p25_13r", "p25_14s",
    "p25_14t",
    "p25_2_note",
    "p25_2a", "p25_2b", "p25_2c", "p25_2d", "p25_2e", "p25_2f",
    "p25_2g1", "p25_2h", "p25_2i",
    "p27_note", "p27a", "p27b", "p27c", "p27d", "p27e",
    "p34",
    paste0("same_school", 2015:2010),
    paste0("cole_dif_", 5:11)
  )
})

capture_tables(indirect_PII)


# Recode those with very specific values. 
# !!! No Indirect PII- Categorical variables with very specific values.



#'# Matching and crosstabulations: Run automated PII check 

# selected categorical key variables: gender, occupation/education and age
selectedKeyVars <- c("i17", "grado2016_admin") ##!!! Replace with candidate categorical demo vars


# Build the sdcMicro object on the chosen key variables and print it
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial

#' Show values of key variable of records that violate k-anonymity
mydata <- labelDataset(mydata)
# Second column of the individual-risk table, compared against 2
# (for 2-anonymity)
individual_risk <- sdcInitial@risk$individual
notAnon <- individual_risk[, 2] < 2
mydata[notAnon, selectedKeyVars]
sdcFinal <- localSuppression(sdcInitial)

# Recombining anonymized variables

# Print the key variables of the flagged records as they stand after
# local suppression
extractManipData(sdcFinal)[notAnon, selectedKeyVars]
# In the released data, grado2016_admin is blanked for every flagged record
mydata[notAnon, "grado2016_admin"] <- NA



#'# Open-ends: review responses for any sensitive information, redact as necessary
#' 
# !!! Identify open-end variables here: 
# Open-end variables handed to report_open(), in this order. The a*/m* and
# school* pairs are generated; `rep(..., each = 2)` combined with a recycled
# two-suffix vector yields "<stem>_o", "<stem>g" (or "_name",
# "_name1_extra") for each stem in turn.
# NOTE(review): "a11_o" is not in this list although it is dropped together
# with the other open-ends further down -- confirm whether it should be
# reviewed here too.
open_ends <- c(
  "i16", "p4b", "p13c1", "v134",
  paste0(rep(paste0("a", 2:10), each = 2), c("_o", "g")),
  "a11g",
  paste0(rep(paste0("m", 2:11), each = 2), c("_o", "g")),
  "q48", "p35b1",
  "pref18a", "pref18b", "pref19a", "pref19b", "pref15f", "pref16f",
  paste0(
    rep(paste0("school", 2015:2010), each = 2),
    c("_name", "_name1_extra")
  ),
  "cole_dif"
)

report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number 
# Drop all open-end variables, as they contain actual verbatim responses in Spanish

# Remove every open-end column in a single subset instead of one whole-data
# copy per variable (the old sequence also removed "v134" twice).
# The set is everything in `open_ends` plus "a11_o", which is dropped here
# even though it is not part of `open_ends`. Names absent from `mydata` are
# ignored by `%in%`.
open_end_drops <- c(open_ends, "a11_o")
mydata <- mydata[!(names(mydata) %in% open_end_drops)]

#'# GPS data: Displace
#  No GPS data

#'# Save processed data in Stata and SPSS format

# Write the processed data twice under the same "<filename>_PU" stem:
# once as Stata (.dta) and once as SPSS (.sav).
pu_stem <- paste0(filename, "_PU")
haven::write_dta(mydata, paste0(pu_stem, ".dta"))
haven::write_sav(mydata, paste0(pu_stem, ".sav"))

# Add report title dynamically
title_var <- paste("DOL-ILAB SDC -", filename)
#'---
#'  title: `r title_var`
#'---
