## Setup automatic checks for SDC risk. Grep call will ignore case, so it is enough to enter lower case. 
# Direct PII

Direct.lab <- c("name")
Direct.var <- c("nam")

# Indirect PII

Indirect.lab <- c("gender", "sex", "edu", "occupation")
Indirect.var <- c("age", "edu")

# GPS

GPS.lab <- c("latitude", "longitude", "gps")
GPS.var <- c("lo", "la")

# Open-end variables

open.lab <- c("Comment", "comment", "verbatim", "Verbatim", "text", "Text", "Translation", "translation")
open.var <- c()

## Preliminary flagger function

flagger <- function (dictionary, lab.strings, var.strings, flag) {
  dictionary$PII_disclosure_risk <- as.character(dictionary$PII_disclosure_risk)
  index <- unique (c(grep(paste(c(var.strings, lab.strings),collapse="|"), 
                          dictionary$Variable, 
                          ignore.case=TRUE), 
                     grep(paste(lab.strings,collapse="|"), 
                          dictionary$Label, 
                          ignore.case=TRUE)))
  dictionary$PII_disclosure_risk[index] <- paste0(dictionary$PII_disclosure_risk[index], paste0(", ", flag))
  return (dictionary)
}

# Dictionary function

dictionary <- function (data) {
  numVars <- length(colnames(mydata))
  dict <- matrix(nrow = numVars, ncol = 13)
  dict[,1] <- colnames(mydata)
  dict2  <- sapply(mydata, function(x) attr(x, "label"))
  for (i in 1:length(dict2)) {
    dict[i,2] <- ifelse(is.null(dict2[[i]]), "", dict2[[i]])
  }
  dict3 <- sapply(mydata, class)
  for (i in 1:length(dict3)) {
    dict[i,3] <- ifelse(is.null(dict3[[i]]), "", dict3[[i]])
  }
  dict[,12] <- ""
  for (i in 1:numVars) {
    #print(paste(i, "out of", numVars, "variables")) # Uncomment if dataset is long and you want to check progress
    tableNA <- as.data.frame(table(mydata[,i], useNA="always")) 
    dict [i,4] <- tableNA[which(is.na(tableNA$Var1)),2] # Number not missing "No-NA"
    tableFreqs <- as.data.frame(table(mydata[,i]))
    if (length(tableFreqs)>1) {
      tableFreqs <- tableFreqs[order(tableFreqs$Freq),]
      dict [i,5] <- tableFreqs[1,1] # Lowest frequency value
      dict [i,6] <- tableFreqs[1,2] # Frequency of lowest frequency value
    } else {
      dict [i,7:8] <- NA
      dict [i,12] <- "All NA"
    }
    dict [i,7:11] <- t(head(mydata[,i], 5)) # Sample of 5 top rows
  }
  dict <- as.data.frame(dict)
  colnames(dict) <- c("Variable", "Label", "Class", "Count_NA", "Lowest_Freq_Value", "Lowest_Freq", 
                      "Row1","Row2","Row3","Row4","Row5", "PII_disclosure_risk", "Action")

  dict<-flagger(dictionary=dict, lab.strings=Direct.lab, var.strings=Direct.var, flag="Direct PII")
  dict<-flagger(dictionary=dict, lab.strings=Indirect.lab, var.strings=Indirect.var, flag="Indirect PII")
  dict<-flagger(dictionary=dict, lab.strings=GPS.lab, var.strings=GPS.var, flag="GPS")
  dict<-flagger(dictionary=dict, lab.strings=open.lab, var.strings=open.var, flag="Open-End")
  
  #Save dictionary and data as csv for inspection
  
  write.csv(dict, paste(filename, "_dictionary.csv", sep=""))
  write.csv(mydata, paste(filename, ".csv", sep=""))
  }

### GEOGRAPHIC DISPLACEMENT PROCEDURE AND GEOREFERENCED DATA RELEASE POLICY FOR DHS
# Merge displaced GPS into dataset

displace.merger <- function (displacedGPS, gps.vars) {
  displaced.df <- data.frame(coordinates(displacedGPS))
  row.names(displaced.df) <- which(complete.cases(mydata[,gps.vars]))
  mostattributes(displaced.df$coords.x1) <- attributes(mydata[,gps.vars[1], drop=T])
  mostattributes(displaced.df$coords.x2) <- attributes(mydata[,gps.vars[1], drop=T])
  mydata.displaced <- left_join(rownames_to_column(mydata), 
                                rownames_to_column(displaced.df), 
                                by = ("rowname"))
  mydata.displaced[,gps.vars] <- mydata.displaced[,c("coords.x1", "coords.x2")]
  mydata.displaced <- mydata.displaced[!names(mydata.displaced) %in% c("rowname", 
                                                                       "coords.x1", 
                                                                       "coords.x2")]
  return (mydata.displaced)
}

### Functon below adapted from : https://dhsprogram.com/pubs/pdf/SAR7/SAR7.pdf

displace <- function(gps.vars, admin, samp_num, other_num){
  #gps.vars must include two vectors in the following order: Longitude, Latitude.
  start <- Sys.time()
  map1 <- ggplot() + geom_polygon(data=countrymap, aes(x=long, y=lat, group=group), fill="darkgrey", alpha=0.3) +
    geom_point(data=mydata, aes_string(x=gps.vars[1], y=gps.vars[2])) + 
    ggtitle(paste0("Original GPS points: ", colnames=(gps.vars[1]), ", ", colnames=(gps.vars[2])))
  plot(map1)
  print("Summary Long/Lat statistics before displacement")
  print(summary(mydata[gps.vars]))
  coords <- as.data.frame(mydata[complete.cases(mydata[gps.vars]),gps.vars])
  URBAN_RURA <- as.data.frame(rep("R", length(coords[,1])))
  colnames(URBAN_RURA) <- "URBAN_RURA"
  crs_proj    <- admin@proj4string 
  pts <- SpatialPointsDataFrame(coords, data=cbind(URBAN_RURA), proj4string = crs_proj)
  
  n <- length(pts)
  offset.dist <- ifelse(pts$URBAN_RURA == "U", 0.025, 0.05) # Tweak for offset distance
  rural <- which(pts$URBAN_RURA == "R")
  rur.n <- floor(0.01*length(rural))
  offset.dist[sample(rural, rur.n, replace = FALSE)] <- 0.1 # Tweak for offset distance
  r.pts0 <- list(0)
  for(i in 1:nrow(pts)){
    r.pts0[[i]]<-matrix(0,nrow=samp_num,ncol=2)
    #-- Buffer around point --#
    pdsc <- disc(radius = offset.dist[i], centre = c(coordinates(pts)[i,1],
                                                     coordinates(pts)[i,2]))
    pdsc <- as(pdsc, "SpatialPolygons")
    proj4string(pdsc) <- CRS(as.character(crs_proj))
    #-- Intersection with admin --#
    int <- gIntersection(pdsc, admin)
    #-- Generating random point
    if(!is.null(int)){
      rpt <- csr(int@polygons[[1]]@Polygons[[1]]@coords, other_num)
      probs<-1/rdist(coordinates(pts[i,]),rpt)
      rpt<-rpt[sample(c(1:other_num),size=samp_num,prob=(probs/sum(probs))),]
      r.pts0[[i]] <- rpt
    }
    if(is.null(int)){
      rpt <- csr(pdsc@polygons[[1]]@Polygons[[1]]@coords, other_num)
      probs<-1/rdist(coordinates(pts[i,]),rpt)
      rpt<-rpt[sample(c(1:other_num),size=samp_num,prob=(probs/sum(probs))),]
      r.pts0[[i]] <- rpt
    }
  }
  #Arranging the Output
  if(samp_num==1){
    r.pts<-list(0)
    r.pts[[1]]<-matrix(0,nrow=n,ncol=2)
    for(k in 1:n){
      r.pts[[1]][k,]<-c(r.pts0[[k]])
    }
    r.pts[[1]]<- SpatialPoints(r.pts[[1]], CRS(as.character(crs_proj)))
  }
  if(samp_num>1){
    r.pts<-list(0)
    for(j in 1:samp_num){
      r.pts[[j]]<-matrix(0,nrow=n,ncol=2)
      for(k in 1:n){
        r.pts[[j]][k,]<-r.pts0[[k]][j,]
      }
      r.pts[[j]]<- SpatialPoints(r.pts[[j]], CRS(as.character(crs_proj)))
    }
  }

  mydata_displaced <- displace.merger(r.pts, gps.vars)
  plot(map1 + geom_point(alpha = 0.05, data=mydata, 
                         aes_string(x=gps.vars[1], 
                                    y=gps.vars[2]), 
                         colour = "red")
       + ggtitle(paste0("Displaced GPS points: ", colnames=(gps.vars[1]), ", ", colnames=(gps.vars[2]))))
  print("Summary Long/Lat statistics after displacement")
  print(summary(mydata_displaced[gps.vars]))
  end <- Sys.time()
  print(paste0("Processing time = ", end-start))
  return(mydata_displaced)
}


# Check distance between original and displaced points

distance.checker <- function (gps.vars, displacedGPS) {
  coords1 <- mydata[complete.cases(mydata[gps.vars]),gps.vars]
  coords2 <- as.data.frame(coordinates(displacedGPS))
  distance <- c()
  distance[1] <- 0
  for (i in 1:nrow(coords1)) {
    distance[i] <- distm (coords1[i,], coords2[i,], fun = distHaversine)
    }
  print ("Distance between original and displaced GPS data")
  plot(hist(distance))
  print(summary(distance))
}

# Label dataset to fix problems after recoding variables

labelDataset <- function(data) {
  correctLabel <- function(x) {
    
    if(!is.null(attributes(x)$labels)) {
      class(attributes(x)$labels) <- typeof(x)
    }
    return(x)
  }
  for(i in colnames(data)) {
    data[, i] <- correctLabel(data[, i])
  }
  return(data)
}

# Function to produce tables using haven variable names, labels and value labels 

haven_table <- function (varname, data=mydata) {
  table(droplevels(haven::as_factor(data[,varname])), 
  dnn = paste0(varname, ". ", attr(data[[varname]], "label")),
  useNA = "ifany")
}

# Encode Direct PII-team: Encodes strings and drops value labels. Keeps variable labels.

encode_direct_PII_team <- function (variables){
  mydata2 <- mydata
  for (i in variables) {
    print("Frequency table before encoding")
    print(haven_table(i))
    label <- attr(mydata[[i]], "label")
    mydata2[,i] <- as.integer(factor(mydata[[i]]))
    print("Frequency table after encoding")
    var_label(mydata2[,i]) <- label # Add variable label back for table check
    print(haven_table(i, data = mydata2))
    mydata[,i] <- zap_labels(mydata2[,i]) # May be needed for variables with value labels
    var_label(mydata[,i]) <- label # Add variable label back though
  }
  return (mydata)
}

# Encode small locations. 
# Function turns factors labels into numeric labels, adding a random number between 100 and 1000
# The random number is added to eliminate potential confusion with official codes for the encoded location (e.g. census)

encode_location <- function (variables, missing){
  seed <- 123456123
  mydata2 <- mydata
  for (i in variables) {
    set.seed(seed)
    label <- attr(mydata[[i]], "label")
    print("Frequency table before encoding")
    print(haven_table(i))
    missing_rows <- mydata[,i]==missing
    mydata2[,i] <- as.integer(factor(mydata[[i]])) + sample(100:1000, 1)
    mydata2[,i][missing_rows] <- missing
    var_label(mydata2[,i]) <- label # Add variable label back for table check
    print("Frequency table after encoding")    
    print(haven_table(i, data = mydata2))
    mydata[,i] <- zap_labels(mydata2[,i]) # May be needed for variables with value labels
    var_label(mydata[,i]) <- label # Add variable label back though
    seed <- seed + 1
  }
  return (mydata)
}

# Capture tables

capture_tables <- function (variables) {
  pii.tables <- list() # initalize an empty list
  counter <- 1
  for (i in indirect_PII) {
    pii.tables[[counter]] <- table(droplevels(haven::as_factor(mydata[,i])), 
                                 dnn = paste0(i, ". ", attr(mydata[[i]], "label")), 
                                 useNA = "ifany")
  counter <- counter+1
  }
capture.output(pii.tables, file = "pii.tables.txt")
}

# Global recoding function for "Indirect PII - ordinal" variables

ordinal_recode <- function (variable, break_points, missing, value_labels) {
  label <- var_label(mydata[,variable])
  missing_rows <- mydata[,variable]==missing
  hist(mydata[,variable][!missing_rows])
  table(mydata[,variable][!missing_rows])
  recoded <- cut(mydata[[variable]], breaks=c(break_points, missing+1), right=F) ###!!! Adapt breaks to data
  print(table(mydata[[variable]], recoded))
  mydata[,variable] <- to_labelled(recoded)
  var_label(mydata[,variable]) <- label
  if(missing(value_labels)) {} else {
    val_labels(mydata[[variable]]) <- value_labels
  }
  print(haven_table(variable, data = mydata))
  print("Inspect value labels and relabel as necessary")
  print(val_labels(mydata[[variable]]))
  return(mydata)
}

# Top-coding function for "Indirect PII - ordinal" variables

top_recode <- function (variable, break_point, missing) {
  print("Frequency table after encoding") 
  print(haven_table(variable))
  value_labels <- val_labels(mydata[[variable]])
  missing_rows <- mydata[[variable]] %in% missing
  plot(hist(mydata[[variable]][!missing_rows]))
  mydata2 <- mydata
  mydata2[,variable][mydata2[,variable]>break_point & !missing_rows] <- break_point
  val_labels(mydata2[[variable]]) <- value_labels
  val_label(mydata2[[variable]], break_point) <- paste0(break_point, "or more")
  print("Frequency table after encoding") 
  print(haven_table(variable, data=mydata2))
  return(mydata2)
}

report_open <- function (list_open_ends) {
  dir.create(file.path(getwd(), 'verbatims'), recursive = TRUE)
  for (i in list_open_ends) {
    index <- nzchar(apply(mydata[,i],2,function(x)gsub('\\s+', '',x))) #Identifies location of non-empty strings after removing " ". 
    verbatim.df <- cbind(rownames(mydata)[index], mydata[index,i])
    colnames(verbatim.df) <- c("Row.names", i)
    write.csv(verbatim.df, paste0("./verbatims/", i, ".csv"))
  }
}


# Setup environment

set.seed(3333)
location <- getwd()
if (!require("pacman")) install.packages("pacman")
library ("pacman")
pacman::p_load(haven, sdcMicro, dplyr, maps, ggplot2, maptools, raster, rgdal, 
               spatstat, rgeos, splancs, fields, geosphere, stringr, tibble, labelled)
mydata <- haven::read_dta(paste0(filename, ".dta"))

dictionary("mydata") #Create dictionary for inspection


