# Start from a clean workspace, including hidden (dot-prefixed) objects.
# `TRUE` is spelled out in full: a bare `t` is base R's transpose function,
# not a logical, and makes ls() fail with "invalid 'all.names' argument".
rm(list = ls(all.names = TRUE))
filename <- "Malawi_HHHead_Public Use" # !!!Update filename
functions_vers <- "functions_1.7.R" # !!!Update helper functions file
# Load the helper functions file named above.
source(functions_vers)
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
# !!!Include any Direct PII variables
# Columns named here are removed from mydata outright.
# NOTE(review): mydata is not created in the visible code; presumably it is
# loaded by the sourced helpers -- confirm.
dropvars <- c(
  "b_edunames_", "b_empname_", "b_else_members_",
  "b_tob_members_", "b_rel_names_"
)
# Keep only the columns whose names have no match in dropvars.
mydata <- mydata[is.na(match(names(mydata), dropvars))]
# !!!No Direct PII-Team
# Small locations: Encode locations with pop <100,000 using random large numbers
# !!!Include relevant variables, but check their population size first to confirm they are <100,000
# Location columns handed to encode_location(); 999999 is passed as the
# missing code.
# NOTE(review): mydata is not passed in; the helper presumably reads it from
# the global environment -- confirm.
locvars <- c(
  "community", "comm", "b_treat", "b_comm",
  "e_community", "e_comm", "b_ta", "e_ta"
)
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## community. Community
## Chaola Chazim'bobo Chikho 2 Chinyata Choumba Kakoloha Kanongo Luwira Mafuta
## 27926 363 312 1145 658 627 288 845 237 345
## Mkombezi Mlambe Mzokoto Nanzomba Ndaula Nyongani Pondani Tamanimwendo Waliranji
## 553 496 625 584 728 73 597 552 811
## [1] "Frequency table after encoding"
## community. Community
## 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859
## 811 27926 363 552 237 496 584 73 553 288 1145 845 312 627 728 345 658 597 625
## [1] "Frequency table before encoding"
## comm. Numeric Values For Community
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 <NA>
## 658 345 728 73 552 363 312 584 597 625 627 845 811 1145 496 288 237 553 27926
## [1] "Frequency table after encoding"
## comm. Numeric Values For Community
## 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 <NA>
## 496 584 288 728 1145 552 312 845 625 658 811 627 237 73 553 345 597 363 27926
## [1] "Frequency table before encoding"
## b_treat. b_Community Name (Lower)
## chaola chazim'bobo chikho 2 chinyata choumba kakoloha kanongo luwira
## 19563 211 535 2672 1131 1091 517 1555 441
## mafuta mkombezi mlambe mzokoto nanzomba ndaula nyongani pondani tamani mwendo
## 675 1094 802 1162 1056 1338 115 1200 1001
## waliranji
## 1606
## [1] "Frequency table after encoding"
## b_treat. b_Community Name (Lower)
## 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775
## 115 802 1056 1606 535 1162 1555 2672 1338 19563 1131 441 211 675 517 1200 1001 1094 1091
## [1] "Frequency table before encoding"
## b_comm. b_Community
## CHAOLA CHAZIM'BOBO CHIKHO 2 CHINYATA CHOUMBA KAKOLOHA KANONGO LUWIRA
## 19563 211 535 2672 1131 1091 517 1555 441
## MAFUTA MKOMBEZI MLAMBE MZOKOTO NANZOMBA NDAULA NYONGANI PONDANI TAMANI MWENDO
## 675 1094 802 1162 1056 1338 115 1200 1001
## WALIRANJI
## 1606
## [1] "Frequency table after encoding"
## b_comm. b_Community
## 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705
## 1555 1091 1162 1131 675 517 1001 802 535 115 211 1056 1606 1338 441 19563 1094 1200 2672
## [1] "Frequency table before encoding"
## e_community. e_Community
## Chaola Chazim'bobo Chikho 2 Chinyata Choumba Kakoloha Kanongo Luwira Mafuta
## 18202 705 600 2270 1277 1213 601 1707 479 704
## Mkombezi Mlambe Mzokoto Nanzomba Ndaula Nyongani Pondani Tamanimwendo Waliranji
## 1142 947 1302 1145 1411 138 1174 1106 1642
## [1] "Frequency table after encoding"
## e_community. e_Community
## 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718
## 1411 705 1142 947 1145 1277 1106 1213 600 2270 704 1707 1174 601 1642 1302 138 18202 479
## [1] "Frequency table before encoding"
## e_comm. e_Numeric Values For Community
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 <NA>
## 1277 704 1411 138 1106 705 600 1145 1174 1302 1213 1707 1642 2270 947 601 479 1142 18202
## [1] "Frequency table after encoding"
## e_comm. e_Numeric Values For Community
## 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 <NA>
## 947 1213 1277 600 1106 138 704 1302 1411 2270 601 705 1174 479 1707 1142 1642 1145 18202
## [1] "Frequency table before encoding"
## b_ta. b_Traditional Authority
## KASAKULA MAVWERE MWANKHUNIKILA
## 19563 6476 8512 3214
## [1] "Frequency table after encoding"
## b_ta. b_Traditional Authority
## 915 916 917 918
## 19563 6476 8512 3214
## [1] "Frequency table before encoding"
## e_ta. e_Traditional Authority Name:
## ,mavwere 1 3514007 Asakula Bokosi
## 18202 5 3 6 4 6
## Chigunda Chikho CHIKHO CHIKHO 2 Chinthumba Chinyata
## 8 322 108 4 6 22
## Jasakula Kabudula Kachiza Kamtsukwa Kasakuka kasakula
## 5 5 3 5 3 15
## Kasakula KASAKULA Kasakula 2(Lufeyo) Kasakule Kasakulla Kasakulu
## 5757 361 3 4 4 4
## KASAUKULA Kaskula Kasskula Kasukula Kasula 1 Katsakula
## 3 6 7 10 3 150
## MABVERE Maliketi Mamvere Mamwere Manvwere Mavere
## 87 10 5 11 5 5
## Mavwele Mavwera mavwere Mavwere MAVWERE Mavwerer
## 72 4 15 8675 62 3
## Mawvere Mchinji Mlambe Mwakhunikila Mwamvele MWANHKUNIKIRA
## 150 6 8 4 9 11
## Mwanhunikira Mwankhuni Mwankhunikala Mwankhunikil Mwankhunikila mwankhunikira
## 4 6 33 5 1032 43
## Mwankhunikira MWANKHUNIKIRA Mwankhunikjra Mwankhunikra Mwankhunikura Mwankhuninikira
## 1234 1079 6 23 3 5
## Mwankhunira Mwankhunkila Navwere NWANKHUNIKIRA Pondani Rosalina Josamu
## 7 7 4 7 6 4
## SAKULA TA Kasakula TA Mavwele Vesinati Mayeso Victor Phiri Waliranji
## 3 23 8 4 4 14
## Wankhunikila Wmankhunikila
## 9 6
## [1] "Frequency table after encoding"
## e_ta. e_Traditional Authority Name:
## 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498
## 4 6 11 361 10 11 23 3 87 6 18202 43 4 4 3 23 1079 8 5 6 4
## 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519
## 6 150 4 5 9 7 7 1234 4 3 8 9 4 6 8 72 5 15 5 10 15
## 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540
## 4 6 4 5757 3 4 6 5 7 6 62 5 3 3 14 108 4 22 5 6 150
## 541 542 543 544 545 546 547 548 549 550 551
## 8675 1032 7 5 3 5 4 3 322 3 33
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less.
# Top-code household size: values of 10 or more collapse into one category.
# 888 and 999999 are passed as the missing codes.
mydata <- top_recode(
  variable = "num_people",
  break_point = 10,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## num_people. Can You Please Tell Me How Many People Live In This Household, Including Yoursel
## 2 3 4 5 6 7 8 9 10 11 12 13 14 <NA>
## 77 391 1195 2072 2381 1872 1024 474 189 115 37 4 8 27926
## [1] "Frequency table after encoding"
## num_people. Can You Please Tell Me How Many People Live In This Household, Including Yoursel
## 2 3 4 5 6 7 8 9 10 or more <NA>
## 77 391 1195 2072 2381 1872 1024 474 353 27926
# Top-code the b_ household member count at 10 or more.
# 888 and 999999 are passed as the missing codes.
mydata <- top_recode(
  variable = "b_hhcount",
  break_point = 10,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## b_hhcount. b_Number Of Household Members
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 <NA>
## 5 190 914 2679 4003 3843 3071 1844 1010 387 137 65 43 11 19563
## [1] "Frequency table after encoding"
## b_hhcount. b_Number Of Household Members
## 1 2 3 4 5 6 7 8 9 10 or more <NA>
## 5 190 914 2679 4003 3843 3071 1844 1010 643 19563
# Top-code the e_ household member count at 10 or more.
# 888 and 999999 are passed as the missing codes.
mydata <- top_recode(
  variable = "e_hhcount",
  break_point = 10,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## e_hhcount. e_Number Of Household Members
## 2 3 4 5 6 7 8 9 10 11 12 13 14 <NA>
## 154 910 2678 4169 4500 3508 1995 942 387 210 88 10 12 18202
## [1] "Frequency table after encoding"
## e_hhcount. e_Number Of Household Members
## 2 3 4 5 6 7 8 9 10 or more <NA>
## 154 910 2678 4169 4500 3508 1995 942 707 18202
# Top-code the e_ household size question at 10 or more.
# 888 and 999999 are passed as the missing codes.
mydata <- top_recode(
  variable = "e_num_people",
  break_point = 10,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## e_num_people. e_Can You Please Tell Me How Many People Live In This Household, Including Yours
## 2 3 4 5 6 7 8 9 10 11 12 13 14 <NA>
## 154 910 2678 4169 4500 3508 1995 942 387 210 88 10 12 18202
## [1] "Frequency table after encoding"
## e_num_people. e_Can You Please Tell Me How Many People Live In This Household, Including Yours
## 2 3 4 5 6 7 8 9 10 or more <NA>
## 154 910 2678 4169 4500 3508 1995 942 707 18202
# Top code number of loans taken in the last year
# Top-code the e_ number of loans taken in the last year at 10 or more.
# 88 is passed as the missing code.
mydata <- top_recode(
  variable = "e_loannum",
  break_point = 10,
  missing = 88
)
## [1] "Frequency table before encoding"
## e_loannum. e_Number Of Loans Taken In Last Year
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 15 88 <NA>
## 1456 4335 2657 1756 743 482 221 88 65 34 86 12 35 9 4 83 25699
## [1] "Frequency table after encoding"
## e_loannum. e_Number Of Loans Taken In Last Year
## 0 1 2 3 4 5 6 7 8 9 10 or more
## 1456 4335 2657 1756 743 482 221 88 65 34 146
## 88 <NA>
## 83 25699
# Top-code the number of loans taken in the last year at 10 or more.
# 88 is passed as the missing code.
mydata <- top_recode(
  variable = "loannum",
  break_point = 10,
  missing = 88
)
## [1] "Frequency table before encoding"
## loannum. Number Of Loans Taken In Last Year
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 15 88 <NA>
## 4471 2209 1330 892 371 250 106 47 35 16 46 5 17 2 2 40 27926
## [1] "Frequency table after encoding"
## loannum. Number Of Loans Taken In Last Year
## 0 1 2 3 4 5 6 7 8 9 10 or more
## 4471 2209 1330 892 371 250 106 47 35 16 72
## 88 <NA>
## 40 27926
# Top-code the b_ number of loans taken in the last year at 10 or more.
# 88 is passed as the missing code.
mydata <- top_recode(
  variable = "b_loannum",
  break_point = 10,
  missing = 88
)
## [1] "Frequency table before encoding"
## b_loannum. b_Number Of Loans Taken In Last Year
## 0 1 2 3 4 5 6 7 8 9 10
## 178 2849 1518 1678 650 397 222 39 76 28 92
## 12 15 19 Don't know <NA>
## 18 4 4 32 29980
## [1] "Frequency table after encoding"
## b_loannum. b_Number Of Loans Taken In Last Year
## 0 1 2 3 4 5 6 7 8 9 10 or more
## 178 2849 1518 1678 650 397 222 39 76 28 118
## Don't know <NA>
## 32 29980
# Top-code typical weekly tobacco-work earnings at 5000 or more.
# 66666 and 88888 are passed as the missing codes.
mydata <- top_recode(
  variable = "b_tobearn_",
  break_point = 5000,
  missing = c(66666, 88888)
)
## [1] "Frequency table before encoding"
## b_tobearn_. b_(Tobearn) In A Typical Week, How Much Did <<Name>> Earn From Working In Tobacc
## None 5 21 30 50 100 120 150 200 250 300
## 44 1 1 1 2 3 1 6 27 2 12
## 350 400 450 500 550 600 700 750 800 1000 1050
## 1 5 2 15 2 5 3 2 6 34 1
## 1200 1250 1400 1500 1800 2000 2500 2800 3000 3200 3500
## 1 1 8 6 2 14 1 1 9 1 4
## 4000 4500 4900 5000 6000 7000 8000 9000 10000 15000 20000
## 7 1 1 8 3 3 1 1 1 1 1
## 30000 50000 Ik kind Don't know <NA>
## 2 2 35 16 37458
## [1] "Frequency table after encoding"
## b_tobearn_. b_(Tobearn) In A Typical Week, How Much Did <<Name>> Earn From Working In Tobacc
## None 5 21 30 50 100 120 150 200 250
## 44 1 1 1 2 3 1 6 27 2
## 300 350 400 450 500 550 600 700 750 800
## 12 1 5 2 15 2 5 3 2 6
## 1000 1050 1200 1250 1400 1500 1800 2000 2500 2800
## 34 1 1 1 8 6 2 14 1 1
## 3000 3200 3500 4000 4500 4900 5000 or more Ik kind Don't know <NA>
## 9 1 4 7 1 1 23 35 16 37458
# Top-code adult earnings (wages, salary, commission or pay) at 50000 or more.
# 66666 and 88888 are passed as the missing codes.
mydata <- top_recode(
  variable = "b_adultearn_",
  break_point = 50000,
  missing = c(66666, 88888)
)
## [1] "Frequency table before encoding"
## b_adultearn_. b_(Adultearn) How Much Did You/<<Name>> Earn In Wages, Salary, Commission Or Pay
## 0 2 7 10 20 50 88 100 150 200 250
## 43 1 1 1 2 1 1 11 4 18 4
## 300 350 400 450 500 600 700 750 800 900 1000
## 14 3 13 2 89 14 7 4 14 4 172
## 1050 1060 1100 1150 1200 1300 1400 1500 1600 1650 1700
## 1 2 1 1 17 2 1 93 2 1 5
## 1800 2000 2200 2250 2400 2500 2800 3000 3100 3200 3250
## 7 227 1 1 2 77 3 195 2 1 1
## 3300 3500 3600 3700 3750 3800 4000 4250 4500 4800 5000
## 1 38 3 1 1 1 103 1 8 1 171
## 5500 6000 6500 6666 7000 7200 7500 7800 8000 8500 9000
## 7 46 5 3 55 6 2 1 33 1 11
## 10000 11000 11600 12000 12500 13000 13500 14000 14400 14500 15000
## 102 3 1 20 1 6 1 7 4 1 37
## 15550 16000 17000 17500 18000 19000 20000 21000 22000 24000 25000
## 1 6 3 1 15 3 46 4 3 1 8
## 27000 28000 28700 30000 32000 35000 36000 40000 40600 43000 45000
## 1 1 1 28 1 4 1 8 1 1 3
## 50000 54000 56000 60000 65000 In kind 70000 80000 85000 87000 Don't know
## 14 1 1 9 1 202 1 5 2 1 147
## 90000 <NA>
## 1 35491
## [1] "Frequency table after encoding"
## b_adultearn_. b_(Adultearn) How Much Did You/<<Name>> Earn In Wages, Salary, Commission Or Pay
## 0 2 7 10 20 50 88 100 150
## 43 1 1 1 2 1 1 11 4
## 200 250 300 350 400 450 500 600 700
## 18 4 14 3 13 2 89 14 7
## 750 800 900 1000 1050 1060 1100 1150 1200
## 4 14 4 172 1 2 1 1 17
## 1300 1400 1500 1600 1650 1700 1800 2000 2200
## 2 1 93 2 1 5 7 227 1
## 2250 2400 2500 2800 3000 3100 3200 3250 3300
## 1 2 77 3 195 2 1 1 1
## 3500 3600 3700 3750 3800 4000 4250 4500 4800
## 38 3 1 1 1 103 1 8 1
## 5000 5500 6000 6500 6666 7000 7200 7500 7800
## 171 7 46 5 3 55 6 2 1
## 8000 8500 9000 10000 11000 11600 12000 12500 13000
## 33 1 11 102 3 1 20 1 6
## 13500 14000 14400 14500 15000 15550 16000 17000 17500
## 1 7 4 1 37 1 6 3 1
## 18000 19000 20000 21000 22000 24000 25000 27000 28000
## 15 3 46 4 3 1 8 1 1
## 28700 30000 32000 35000 36000 40000 40600 43000 45000
## 1 28 1 4 1 8 1 1 3
## 50000 or more In kind Don't know <NA>
## 36 202 147 35491
# Top code high savings and loan values at roughly the 99.5th percentile (some break points below are hard-coded)
# Top-code last month's household savings at a hard-coded break point of
# 70000. 88888 is passed as the missing code.
mydata <- top_recode(
  variable = "save1",
  break_point = 70000,
  missing = 88888
)
## [1] "Frequency table before encoding"
## save1. Household Savings Amount In The Last Month
## 0 100 200 300 400 500 600 800 1000 1200 1300 1400 1500 1600 1800 2000 2200 2300
## 7699 5 52 10 45 82 32 15 261 15 2 3 48 2 6 279 4 5
## 2400 2500 2800 3000 3200 3500 3600 3700 3800 4000 4500 4600 5000 5500 5600 6000 7000 7500
## 4 32 2 144 4 21 6 3 3 121 6 1 241 3 1 45 26 3
## 8000 9000 9500 10000 11000 11500 11600 12000 13000 14000 15000 16000 17500 18000 19000 20000 21000 24000
## 37 6 3 114 8 4 1 6 12 6 42 4 2 4 2 101 3 14
## 25000 26000 27000 30000 35000 36000 40000 42000 44000 45000 50000 52000 60000 70000 72000 75000 80000 88888
## 10 3 6 47 23 1 10 2 1 3 50 4 7 7 2 1 2 16
## 1e+05 120000 121000 130000 150000 2e+05 5e+05 <NA>
## 21 4 1 3 7 6 2 27926
## [1] "Frequency table after encoding"
## save1. Household Savings Amount In The Last Month
## 0 100 200 300 400 500 600 800 1000
## 7699 5 52 10 45 82 32 15 261
## 1200 1300 1400 1500 1600 1800 2000 2200 2300
## 15 2 3 48 2 6 279 4 5
## 2400 2500 2800 3000 3200 3500 3600 3700 3800
## 4 32 2 144 4 21 6 3 3
## 4000 4500 4600 5000 5500 5600 6000 7000 7500
## 121 6 1 241 3 1 45 26 3
## 8000 9000 9500 10000 11000 11500 11600 12000 13000
## 37 6 3 114 8 4 1 6 12
## 14000 15000 16000 17500 18000 19000 20000 21000 24000
## 6 42 4 2 4 2 101 3 14
## 25000 26000 27000 30000 35000 36000 40000 42000 44000
## 10 3 6 47 23 1 10 2 1
## 45000 50000 52000 60000 70000 or more 88888 <NA>
## 3 50 4 7 56 16 27926
# Top-code total household savings at the break point returned by
# percentile_checker(). 88888 is passed as the missing code to both helpers.
percentile_99.5 <- percentile_checker("saveall", missing = 88888)
mydata <- top_recode(
  variable = "saveall",
  break_point = percentile_99.5,
  missing = 88888
)
## [1] "Frequency table before encoding"
## saveall. Total Household Savings
## 0 1 9 100 200 300 400 450 500 600 800 900 1000 1200 1300 1400
## 7097 5 4 3 41 11 25 2 69 31 19 8 185 11 2 12
## 1500 1600 1800 2000 2200 2300 2400 2500 2600 2800 2900 3000 3200 3400 3500 3600
## 69 5 9 253 4 5 4 29 3 6 2 151 7 6 35 5
## 3800 4000 4300 4500 5000 5500 5600 5800 6000 6500 7000 7500 7600 7700 8000 8500
## 3 82 3 6 269 3 2 2 63 5 43 8 1 3 59 3
## 8888 9000 9500 10000 10500 11000 11500 11600 12000 12800 13000 13600 14000 14500 15000 16000
## 1 17 3 210 4 18 6 1 40 2 24 1 15 3 105 9
## 17200 17500 18000 19000 19300 20000 21000 22000 24000 25000 26000 27000 30000 31000 32000 33500
## 3 2 14 2 3 135 7 4 6 27 2 6 83 1 6 1
## 34000 35000 36000 37000 39000 40000 42000 43000 44000 45000 46000 50000 52000 60000 65000 70000
## 1 17 1 3 3 26 2 3 4 13 6 62 4 23 2 9
## 74000 80000 85000 88888 89000 90000 94000 95000 1e+05 106000 108000 120000 121000 125000 130000 140000
## 3 8 3 36 2 2 3 3 47 4 4 9 1 4 3 4
## 142000 144000 150000 160000 172000 175000 2e+05 250000 3e+05 320000 350000 450000 5e+05 566000 6e+05 8e+05
## 4 3 29 1 2 2 4 8 5 2 2 4 8 3 3 1
## 1e+06 1500000 2500000 3e+06 <NA>
## 1 2 3 3 27926
## [1] "Frequency table after encoding"
## saveall. Total Household Savings
## 0 1 9 100 200 300 400 450
## 7097 5 4 3 41 11 25 2
## 500 600 800 900 1000 1200 1300 1400
## 69 31 19 8 185 11 2 12
## 1500 1600 1800 2000 2200 2300 2400 2500
## 69 5 9 253 4 5 4 29
## 2600 2800 2900 3000 3200 3400 3500 3600
## 3 6 2 151 7 6 35 5
## 3800 4000 4300 4500 5000 5500 5600 5800
## 3 82 3 6 269 3 2 2
## 6000 6500 7000 7500 7600 7700 8000 8500
## 63 5 43 8 1 3 59 3
## 8888 9000 9500 10000 10500 11000 11500 11600
## 1 17 3 210 4 18 6 1
## 12000 12800 13000 13600 14000 14500 15000 16000
## 40 2 24 1 15 3 105 9
## 17200 17500 18000 19000 19300 20000 21000 22000
## 3 2 14 2 3 135 7 4
## 24000 25000 26000 27000 30000 31000 32000 33500
## 6 27 2 6 83 1 6 1
## 34000 35000 36000 37000 39000 40000 42000 43000
## 1 17 1 3 3 26 2 3
## 44000 45000 46000 50000 52000 60000 65000 70000
## 4 13 6 62 4 23 2 9
## 74000 80000 85000 88888 89000 90000 94000 95000
## 3 8 3 36 2 2 3 3
## 1e+05 106000 108000 120000 121000 125000 130000 140000
## 47 4 4 9 1 4 3 4
## 142000 144000 150000 160000 172000 175000 or more <NA>
## 4 3 29 1 2 51 27926
# Top-code the value of all loans taken in the last year at the break point
# returned by percentile_checker(). 88888 and 888888 are passed as the
# missing codes to both helpers.
percentile_99.5 <- percentile_checker("loanval", missing = c(88888, 888888))
mydata <- top_recode(
  variable = "loanval",
  break_point = percentile_99.5,
  missing = c(88888, 888888)
)
## [1] "Frequency table before encoding"
## loanval. Value Of All Loans Taken In Last Year
## 0 1 2 3 4 5 6 7 10 11 16 60 75 88 100 110
## 4471 12 15 7 7 2 1 3 4 3 4 2 7 2 8 1
## 200 250 300 350 400 500 600 700 750 760 800 1000 1200 1400 1500 1700
## 14 6 9 6 10 56 2 7 10 6 5 128 5 4 43 4
## 1750 2000 2200 2270 2300 2400 2500 2600 2700 3000 3500 3600 4000 4500 4600 4700
## 1 223 6 1 2 3 15 3 3 149 12 2 111 12 2 3
## 4800 5000 5200 5500 5800 6000 6500 7000 7500 7680 8000 8400 8500 9000 9500 9720
## 6 296 4 4 3 143 1 77 6 1 114 3 2 86 18 1
## 10000 10500 10800 11000 11200 11300 11500 12000 12500 12600 13000 13500 13600 14000 14400 14500
## 510 17 2 20 4 3 3 122 4 4 38 3 4 43 4 3
## 15000 15500 15600 16000 16500 16800 17000 17400 17500 18000 18400 18500 19000 19700 20000 20500
## 249 9 4 34 5 2 40 4 7 46 2 4 11 4 361 1
## 21000 22000 22500 22700 23000 23500 24000 24020 25000 25500 26000 27000 28000 29000 29400 30000
## 24 29 2 3 41 2 39 1 87 3 10 17 27 4 3 233
## 30500 31000 32000 32400 33000 34000 35000 36000 36800 37000 38000 38200 39000 40000 41000 42000
## 8 8 30 3 20 4 63 13 2 4 19 4 8 162 2 5
## 42200 43500 44000 45000 45500 46000 47000 47500 48000 48400 49000 50000 52000 53000 54000 55000
## 3 3 8 31 3 3 7 2 18 3 13 245 7 5 5 17
## 56000 57000 58000 59000 60000 60600 61800 62000 62500 63000 64000 64600 65000 66000 66980 67000
## 10 3 1 4 91 4 3 6 2 4 4 3 17 8 4 5
## 68000 69000 70000 71000 73000 74000 75000 76000 78000 80000 82000 84000 84500 85000 88000 88888
## 4 2 56 2 9 3 29 3 6 56 3 5 1 17 3 4
## 89000 90000 90500 93000 96000 97000 99000 1e+05 104000 105000 108000 112000 115000 119000 120000 124000
## 2 22 2 2 2 5 4 128 4 3 3 1 2 3 25 3
## 125000 126000 128000 136000 138000 139000 140000 143000 147000 150000 157000 160000 165000 173000 195000 196000
## 4 5 2 3 2 4 11 3 1 21 2 5 2 1 4 5
## 2e+05 219000 220000 230000 240000 250000 260000 270000 280000 3e+05 316000 320000 330000 350000 4e+05 450000
## 22 5 2 8 1 13 3 2 2 19 2 5 1 6 10 2
## 5e+05 520000 6e+05 640000 650000 7e+05 888888 1e+06 1011000 1345000 1400000 1500000 1600000 1610000 3e+06 <NA>
## 2 2 9 1 1 8 40 4 2 2 2 2 3 2 1 27926
## [1] "Frequency table after encoding"
## loanval. Value Of All Loans Taken In Last Year
## 0 1 2 3 4 5 6 7 10
## 4471 12 15 7 7 2 1 3 4
## 11 16 60 75 88 100 110 200 250
## 3 4 2 7 2 8 1 14 6
## 300 350 400 500 600 700 750 760 800
## 9 6 10 56 2 7 10 6 5
## 1000 1200 1400 1500 1700 1750 2000 2200 2270
## 128 5 4 43 4 1 223 6 1
## 2300 2400 2500 2600 2700 3000 3500 3600 4000
## 2 3 15 3 3 149 12 2 111
## 4500 4600 4700 4800 5000 5200 5500 5800 6000
## 12 2 3 6 296 4 4 3 143
## 6500 7000 7500 7680 8000 8400 8500 9000 9500
## 1 77 6 1 114 3 2 86 18
## 9720 10000 10500 10800 11000 11200 11300 11500 12000
## 1 510 17 2 20 4 3 3 122
## 12500 12600 13000 13500 13600 14000 14400 14500 15000
## 4 4 38 3 4 43 4 3 249
## 15500 15600 16000 16500 16800 17000 17400 17500 18000
## 9 4 34 5 2 40 4 7 46
## 18400 18500 19000 19700 20000 20500 21000 22000 22500
## 2 4 11 4 361 1 24 29 2
## 22700 23000 23500 24000 24020 25000 25500 26000 27000
## 3 41 2 39 1 87 3 10 17
## 28000 29000 29400 30000 30500 31000 32000 32400 33000
## 27 4 3 233 8 8 30 3 20
## 34000 35000 36000 36800 37000 38000 38200 39000 40000
## 4 63 13 2 4 19 4 8 162
## 41000 42000 42200 43500 44000 45000 45500 46000 47000
## 2 5 3 3 8 31 3 3 7
## 47500 48000 48400 49000 50000 52000 53000 54000 55000
## 2 18 3 13 245 7 5 5 17
## 56000 57000 58000 59000 60000 60600 61800 62000 62500
## 10 3 1 4 91 4 3 6 2
## 63000 64000 64600 65000 66000 66980 67000 68000 69000
## 4 4 3 17 8 4 5 4 2
## 70000 71000 73000 74000 75000 76000 78000 80000 82000
## 56 2 9 3 29 3 6 56 3
## 84000 84500 85000 88000 88888 89000 90000 90500 93000
## 5 1 17 3 4 2 22 2 2
## 96000 97000 99000 1e+05 104000 105000 108000 112000 115000
## 2 5 4 128 4 3 3 1 2
## 119000 120000 124000 125000 126000 128000 136000 138000 139000
## 3 25 3 4 5 2 3 2 4
## 140000 143000 147000 150000 157000 160000 165000 173000 195000
## 11 3 1 21 2 5 2 1 4
## 196000 2e+05 219000 220000 230000 240000 250000 260000 270000
## 5 22 5 2 8 1 13 3 2
## 280000 3e+05 316000 320000 330000 350000 4e+05 or more 888888 <NA>
## 2 19 2 5 1 6 53 40 27926
# Top-code b_ total household savings at the break point returned by
# percentile_checker(). 88888 and 888888 are passed as the missing codes to
# both helpers.
percentile_99.5 <- percentile_checker(
  "b_totalsave",
  missing = c(88888, 888888)
)
mydata <- top_recode(
  variable = "b_totalsave",
  break_point = percentile_99.5,
  missing = c(88888, 888888)
)
## [1] "Frequency table before encoding"
## b_totalsave. b_Total Household Savings
## 0 15 20 30 50 73 88 99 200 300 400 500 720 800 1000 1200
## 2494 1 2 1 1 1 8 2 3 1 1 15 1 1 34 2
## 1400 1500 1600 1800 2000 2500 2530 3000 3200 3500 3600 3800 4000 4500 4800 5000
## 1 14 1 1 42 9 1 35 2 4 2 1 34 5 2 95
## 5500 6000 6500 6700 6800 7000 7400 7500 8000 8500 9000 9200 9400 9500 9600 10000
## 2 27 1 1 1 18 1 1 29 2 11 1 2 1 2 117
## 10100 11000 11250 11400 12000 12500 13000 14000 14500 15000 15200 15500 16000 16500 17000 17500
## 1 7 1 1 21 1 11 5 1 80 1 1 8 1 9 1
## 18000 19000 19500 20000 20800 21000 21500 22000 23000 23500 24000 24500 25000 25200 26000 26400
## 15 7 2 106 1 8 1 4 5 1 9 1 27 1 5 1
## 27000 28000 29000 30000 31000 32000 34000 35000 36000 38000 40000 41000 42000 43000 45000 48000
## 5 5 4 66 3 3 6 12 4 4 41 2 1 3 6 6
## 49000 50000 52000 53000 54000 55000 56000 57000 59000 60000 62000 64000 65000 67500 68000 69000
## 1 56 4 1 2 1 1 2 1 26 1 2 3 1 3 2
## 70000 70200 72000 75000 80000 82000 85000 88888 90000 92000 95000 1e+05 101000 108000 115000 117000
## 10 1 2 4 9 1 3 18 5 1 1 21 1 2 1 1
## 119000 120000 121000 122000 123000 125000 130000 145000 150000 152000 160000 168000 175000 180000 190000 2e+05
## 1 6 1 1 1 1 1 1 17 1 2 1 1 2 1 14
## 204000 228000 250000 260000 270000 280000 3e+05 315000 325000 340000 350000 375000 4e+05 420000 450000 480000
## 1 1 2 1 1 1 6 1 1 3 2 1 3 1 2 1
## 5e+05 504000 6e+05 620000 650000 7e+05 780000 8e+05 888888 9e+05 1e+06 1200000 1500000 2e+06 2500000 3e+06
## 6 1 4 1 2 2 1 3 11 1 2 1 1 3 1 2
## 1e+07 1.1e+07 3e+07 1.2e+08 <NA>
## 1 1 1 1 33902
## [1] "Frequency table after encoding"
## b_totalsave. b_Total Household Savings
## 0 15 20 30 50 73 88 99 200
## 2494 1 2 1 1 1 8 2 3
## 300 400 500 720 800 1000 1200 1400 1500
## 1 1 15 1 1 34 2 1 14
## 1600 1800 2000 2500 2530 3000 3200 3500 3600
## 1 1 42 9 1 35 2 4 2
## 3800 4000 4500 4800 5000 5500 6000 6500 6700
## 1 34 5 2 95 2 27 1 1
## 6800 7000 7400 7500 8000 8500 9000 9200 9400
## 1 18 1 1 29 2 11 1 2
## 9500 9600 10000 10100 11000 11250 11400 12000 12500
## 1 2 117 1 7 1 1 21 1
## 13000 14000 14500 15000 15200 15500 16000 16500 17000
## 11 5 1 80 1 1 8 1 9
## 17500 18000 19000 19500 20000 20800 21000 21500 22000
## 1 15 7 2 106 1 8 1 4
## 23000 23500 24000 24500 25000 25200 26000 26400 27000
## 5 1 9 1 27 1 5 1 5
## 28000 29000 30000 31000 32000 34000 35000 36000 38000
## 5 4 66 3 3 6 12 4 4
## 40000 41000 42000 43000 45000 48000 49000 50000 52000
## 41 2 1 3 6 6 1 56 4
## 53000 54000 55000 56000 57000 59000 60000 62000 64000
## 1 2 1 1 2 1 26 1 2
## 65000 67500 68000 69000 70000 70200 72000 75000 80000
## 3 1 3 2 10 1 2 4 9
## 82000 85000 88888 90000 92000 95000 1e+05 101000 108000
## 1 3 18 5 1 1 21 1 2
## 115000 117000 119000 120000 121000 122000 123000 125000 130000
## 1 1 1 6 1 1 1 1 1
## 145000 150000 152000 160000 168000 175000 180000 190000 2e+05
## 1 17 1 2 1 1 2 1 14
## 204000 228000 250000 260000 270000 280000 3e+05 315000 325000
## 1 1 2 1 1 1 6 1 1
## 340000 350000 375000 4e+05 420000 450000 480000 5e+05 504000
## 3 2 1 3 1 2 1 6 1
## 6e+05 620000 650000 7e+05 or more 888888 <NA>
## 4 1 2 21 11 33902
# Top-code the b_ value of all loans taken in the last year at a hard-coded
# break point of 10000000.
# NOTE(review): the missing list contains 88888888, but the frequency table
# for this variable shows 8888888 -- confirm which code is intended.
mydata <- top_recode(
  variable = "b_loanval",
  break_point = 10000000,
  missing = c(88888, 888888, 88888888)
)
## [1] "Frequency table before encoding"
## b_loanval. b_Value Of All Loans Taken In Last Year
## 0 1 2 3 5 6 9 10 12 14 18
## 178 6 5 4 5 8 5 5 11 4 7
## 21 25 30 45 75 80 88 250 300 350 500
## 7 6 6 2 6 4 7 4 3 3 32
## 1000 1200 1400 1500 2000 2500 2600 2800 3000 3500 3800
## 121 10 5 45 168 22 6 3 246 17 9
## 4000 4500 4800 5000 5500 5800 6000 6400 6500 7000 7500
## 121 10 5 422 10 6 164 3 40 136 17
## 8000 8300 8500 9000 9500 10000 10500 11000 11500 11900 12000
## 145 6 5 83 11 691 12 36 13 7 142
## 12200 12500 13000 14000 15000 15500 15750 16000 16300 16500 17000
## 3 8 65 45 356 3 7 46 4 6 29
## 17500 18000 18500 19000 19500 20000 20200 21000 21500 21800 22000
## 15 74 5 12 4 615 7 30 3 5 59
## 22500 23000 24000 25000 26000 27000 28000 29000 30000 31000 31800
## 4 60 47 125 14 17 34 3 385 26 7
## 32000 33000 34000 35000 35500 36000 36500 37000 37500 38000 39000
## 29 6 12 91 3 20 5 22 4 32 14
## 40000 41000 42000 43000 44000 45000 46000 47000 48000 49000 50000
## 278 12 14 5 15 74 5 17 15 12 416
## 52000 53000 54000 55000 56000 56888 58000 59000 60000 61000 62000
## 19 6 23 34 24 7 7 7 80 4 5
## 62500 63000 64000 65000 67000 68000 69000 70000 72000 73000 75000
## 5 2 11 27 13 7 5 79 4 7 16
## 76000 78000 80000 83000 85000 88888 90000 94000 95000 96000 97000
## 4 6 110 5 21 23 28 4 17 6 5
## 98000 1e+05 105000 110000 112000 114000 118000 120000 123000 125000 126000
## 5 212 7 16 4 3 3 52 3 17 3
## 130000 134000 140000 150000 160000 170000 171000 175000 180000 183000 187000
## 20 8 15 65 5 3 6 3 34 5 6
## 190000 2e+05 215000 218000 230000 240000 250000 260000 270000 280000 3e+05
## 5 70 3 3 8 13 30 5 7 11 29
## 340000 351000 365000 4e+05 420350 441000 498000 5e+05 550000 8e+05 888888
## 6 6 4 20 2 6 9 20 3 4 33
## 9e+05 1e+06 1100000 1150000 1200000 2e+06 2005000 3e+06 8888888 1.1e+07 11500000
## 9 10 4 4 4 8 3 11 4 6 6
## 1.3e+07 Don't know <NA>
## 7 17 29980
## [1] "Frequency table after encoding"
## b_loanval. b_Value Of All Loans Taken In Last Year
## 0 1 2 3 5 6 9 10 12
## 178 6 5 4 5 8 5 5 11
## 14 18 21 25 30 45 75 80 88
## 4 7 7 6 6 2 6 4 7
## 250 300 350 500 1000 1200 1400 1500 2000
## 4 3 3 32 121 10 5 45 168
## 2500 2600 2800 3000 3500 3800 4000 4500 4800
## 22 6 3 246 17 9 121 10 5
## 5000 5500 5800 6000 6400 6500 7000 7500 8000
## 422 10 6 164 3 40 136 17 145
## 8300 8500 9000 9500 10000 10500 11000 11500 11900
## 6 5 83 11 691 12 36 13 7
## 12000 12200 12500 13000 14000 15000 15500 15750 16000
## 142 3 8 65 45 356 3 7 46
## 16300 16500 17000 17500 18000 18500 19000 19500 20000
## 4 6 29 15 74 5 12 4 615
## 20200 21000 21500 21800 22000 22500 23000 24000 25000
## 7 30 3 5 59 4 60 47 125
## 26000 27000 28000 29000 30000 31000 31800 32000 33000
## 14 17 34 3 385 26 7 29 6
## 34000 35000 35500 36000 36500 37000 37500 38000 39000
## 12 91 3 20 5 22 4 32 14
## 40000 41000 42000 43000 44000 45000 46000 47000 48000
## 278 12 14 5 15 74 5 17 15
## 49000 50000 52000 53000 54000 55000 56000 56888 58000
## 12 416 19 6 23 34 24 7 7
## 59000 60000 61000 62000 62500 63000 64000 65000 67000
## 7 80 4 5 5 2 11 27 13
## 68000 69000 70000 72000 73000 75000 76000 78000 80000
## 7 5 79 4 7 16 4 6 110
## 83000 85000 88888 90000 94000 95000 96000 97000 98000
## 5 21 23 28 4 17 6 5 5
## 1e+05 105000 110000 112000 114000 118000 120000 123000 125000
## 212 7 16 4 3 3 52 3 17
## 126000 130000 134000 140000 150000 160000 170000 171000 175000
## 3 20 8 15 65 5 3 6 3
## 180000 183000 187000 190000 2e+05 215000 218000 230000 240000
## 34 5 6 5 70 3 3 8 13
## 250000 260000 270000 280000 3e+05 340000 351000 365000 4e+05
## 30 5 7 11 29 6 6 4 20
## 420350 441000 498000 5e+05 550000 8e+05 888888 9e+05 1e+06
## 2 6 9 20 3 4 33 9 10
## 1100000 1150000 1200000 2e+06 2005000 3e+06 8888888 1e+07 or more Don't know
## 4 4 4 8 3 11 4 19 17
## <NA>
## 29980
# Top-code e_ household savings in the last month at the break point returned
# by percentile_checker(). 88888 is passed as the missing code to both
# helpers.
percentile_99.5 <- percentile_checker("e_save1", missing = 88888)
mydata <- top_recode(
  variable = "e_save1",
  break_point = percentile_99.5,
  missing = 88888
)
## [1] "Frequency table before encoding"
## e_save1. e_Household Savings Amount In The Last Month
## 0 100 200 300 400 500 600 800 1000 1200 1300 1400 1500 1600 1800 2000 2200 2300
## 15219 8 116 19 84 152 64 37 523 29 5 4 106 3 11 568 6 8
## 2400 2500 2800 3000 3200 3500 3600 3700 3800 4000 4500 4600 5000 5500 5600 6000 7000 7500
## 6 66 5 278 8 46 9 5 5 250 14 3 490 5 4 97 51 6
## 8000 9000 9500 10000 11000 11500 11600 12000 13000 14000 15000 16000 17500 18000 19000 20000 21000 24000
## 67 11 5 240 15 6 2 14 20 13 88 13 4 13 5 212 5 27
## 25000 26000 27000 30000 35000 36000 40000 42000 44000 45000 50000 52000 60000 70000 72000 75000 80000 88888
## 22 6 8 98 38 4 19 5 4 6 100 7 17 13 4 3 9 33
## 1e+05 120000 121000 130000 150000 2e+05 5e+05 <NA>
## 46 9 5 6 13 13 5 18202
## [1] "Frequency table after encoding"
## e_save1. e_Household Savings Amount In The Last Month
## 0 100 200 300 400 500 600 800 1000
## 15219 8 116 19 84 152 64 37 523
## 1200 1300 1400 1500 1600 1800 2000 2200 2300
## 29 5 4 106 3 11 568 6 8
## 2400 2500 2800 3000 3200 3500 3600 3700 3800
## 6 66 5 278 8 46 9 5 5
## 4000 4500 4600 5000 5500 5600 6000 7000 7500
## 250 14 3 490 5 4 97 51 6
## 8000 9000 9500 10000 11000 11500 11600 12000 13000
## 67 11 5 240 15 6 2 14 20
## 14000 15000 16000 17500 18000 19000 20000 21000 24000
## 13 88 13 4 13 5 212 5 27
## 25000 26000 27000 30000 35000 36000 40000 42000 44000
## 22 6 8 98 38 4 19 5 4
## 45000 50000 52000 60000 70000 72000 75000 80000 or more 88888
## 6 100 7 17 13 4 3 106 33
## <NA>
## 18202
# Top-code e_ total household savings at the break point returned by
# percentile_checker(). 88888 is passed as the missing code to both helpers.
percentile_99.5 <- percentile_checker("e_saveall", missing = 88888)
mydata <- top_recode(
  variable = "e_saveall",
  break_point = percentile_99.5,
  missing = 88888
)
## [1] "Frequency table before encoding"
## e_saveall. e_Total Household Savings
## 0 1 9 100 200 300 400 450 500 600 800 900 1000 1200 1300 1400
## 13984 8 7 5 86 24 45 5 131 59 38 15 382 21 5 21
## 1500 1600 1800 2000 2200 2300 2400 2500 2600 2800 2900 3000 3200 3400 3500 3600
## 140 8 11 499 6 8 6 56 6 11 5 310 13 14 75 8
## 3800 4000 4300 4500 5000 5500 5600 5800 6000 6500 7000 7500 7600 7700 8000 8500
## 5 165 8 11 553 5 4 4 130 9 77 18 3 5 121 5
## 8888 9000 9500 10000 10500 11000 11500 11600 12000 12800 13000 13600 14000 14500 15000 16000
## 3 32 5 428 9 38 12 2 82 4 46 4 34 5 211 22
## 17200 17500 18000 19000 19300 20000 21000 22000 24000 25000 26000 27000 30000 31000 32000 33500
## 5 4 33 4 6 273 15 6 17 59 5 8 164 5 13 3
## 34000 35000 36000 37000 39000 40000 42000 43000 44000 45000 46000 50000 52000 60000 65000 70000
## 3 31 4 6 7 56 5 6 6 34 11 143 7 47 5 21
## 74000 80000 85000 88888 89000 90000 94000 95000 1e+05 106000 108000 120000 121000 125000 130000 140000
## 5 21 5 83 5 3 5 5 92 6 7 29 5 7 8 6
## 142000 144000 150000 160000 172000 175000 2e+05 250000 3e+05 320000 350000 450000 5e+05 566000 6e+05 8e+05
## 7 6 47 5 4 4 12 15 12 4 6 10 18 6 6 3
## 1e+06 1500000 2500000 3e+06 <NA>
## 4 5 5 8 18202
## [1] "Frequency table after encoding"
## e_saveall. e_Total Household Savings
## 0 1 9 100 200 300 400 450
## 13984 8 7 5 86 24 45 5
## 500 600 800 900 1000 1200 1300 1400
## 131 59 38 15 382 21 5 21
## 1500 1600 1800 2000 2200 2300 2400 2500
## 140 8 11 499 6 8 6 56
## 2600 2800 2900 3000 3200 3400 3500 3600
## 6 11 5 310 13 14 75 8
## 3800 4000 4300 4500 5000 5500 5600 5800
## 5 165 8 11 553 5 4 4
## 6000 6500 7000 7500 7600 7700 8000 8500
## 130 9 77 18 3 5 121 5
## 8888 9000 9500 10000 10500 11000 11500 11600
## 3 32 5 428 9 38 12 2
## 12000 12800 13000 13600 14000 14500 15000 16000
## 82 4 46 4 34 5 211 22
## 17200 17500 18000 19000 19300 20000 21000 22000
## 5 4 33 4 6 273 15 6
## 24000 25000 26000 27000 30000 31000 32000 33500
## 17 59 5 8 164 5 13 3
## 34000 35000 36000 37000 39000 40000 42000 43000
## 3 31 4 6 7 56 5 6
## 44000 45000 46000 50000 52000 60000 65000 70000
## 6 34 11 143 7 47 5 21
## 74000 80000 85000 88888 89000 90000 94000 95000
## 5 21 5 83 5 3 5 5
## 1e+05 106000 108000 120000 121000 125000 130000 140000
## 92 6 7 29 5 7 8 6
## 142000 144000 150000 160000 172000 175000 2e+05 250000 or more
## 7 6 47 5 4 4 12 102
## <NA>
## 18202
# Top-code the value of all loans taken in the last year (e_loanval) at the
# break point returned by percentile_checker(); 88888 and 888888 are passed
# as missing codes to both helpers.
percentile_99.5 <- percentile_checker(
  "e_loanval",
  missing = c(88888, 888888)
)
mydata <- top_recode(
  variable = "e_loanval",
  break_point = percentile_99.5,
  missing = c(88888, 888888)
)
## [1] "Frequency table before encoding"
## e_loanval. e_Value Of All Loans Taken In Last Year
## 1 2 3 4 5 6 7 10 11 16 60 75 88 100 110 200
## 32 33 13 12 6 4 5 9 6 7 4 14 4 13 4 25
## 250 300 350 400 500 600 700 750 760 800 1000 1200 1400 1500 1700 1750
## 12 18 9 18 102 6 17 20 8 10 224 10 7 79 9 2
## 2000 2200 2270 2300 2400 2500 2600 2700 3000 3500 3600 4000 4500 4600 4700 4800
## 441 11 3 4 7 29 6 5 275 23 6 210 32 6 5 10
## 5000 5200 5500 5800 6000 6500 7000 7500 7680 8000 8400 8500 9000 9500 9720 10000
## 573 6 9 4 289 2 157 15 4 222 7 6 154 33 5 1017
## 10500 10800 11000 11200 11300 11500 12000 12500 12600 13000 13500 13600 14000 14400 14500 15000
## 29 6 43 6 4 6 231 10 5 78 5 7 87 6 4 495
## 15500 15600 16000 16500 16800 17000 17400 17500 18000 18400 18500 19000 19700 20000 20500 21000
## 13 7 75 13 6 79 6 16 87 4 8 19 6 744 3 46
## 22000 22500 22700 23000 23500 24000 24020 25000 25500 26000 27000 28000 29000 29400 30000 30500
## 66 3 6 86 5 68 3 178 5 15 36 54 8 8 455 16
## 31000 32000 32400 33000 34000 35000 36000 36800 37000 38000 38200 39000 40000 41000 42000 42200
## 12 62 5 38 10 128 27 4 7 34 6 14 315 6 8 4
## 43500 44000 45000 45500 46000 47000 47500 48000 48400 49000 50000 52000 53000 54000 55000 56000
## 5 14 64 5 7 17 5 31 5 26 498 19 9 9 32 19
## 57000 58000 59000 60000 60600 61800 62000 62500 63000 64000 64600 65000 66000 66980 67000 68000
## 5 3 7 189 7 5 15 6 10 6 5 29 14 6 7 7
## 69000 70000 71000 73000 74000 75000 76000 78000 80000 82000 84000 84500 85000 88000 88888 89000
## 5 109 3 18 5 57 8 13 103 7 10 3 31 7 6 5
## 90000 90500 93000 96000 97000 99000 1e+05 104000 105000 108000 112000 115000 119000 120000 124000 125000
## 35 5 3 5 9 8 261 6 8 4 3 5 5 42 4 6
## 126000 128000 136000 138000 139000 140000 143000 147000 150000 157000 160000 165000 173000 195000 196000 2e+05
## 8 5 5 5 7 18 8 3 55 7 10 5 4 7 8 47
## 219000 220000 230000 240000 250000 260000 270000 280000 3e+05 316000 320000 330000 350000 4e+05 450000 5e+05
## 7 7 17 3 25 6 5 4 44 7 9 4 13 18 4 10
## 520000 6e+05 640000 650000 7e+05 888888 1e+06 1011000 1345000 1400000 1500000 1600000 1610000 3e+06 <NA>
## 5 17 4 5 18 81 9 4 5 5 4 6 5 8 27155
## [1] "Frequency table after encoding"
## e_loanval. e_Value Of All Loans Taken In Last Year
## 1 2 3 4 5 6 7 10 11
## 32 33 13 12 6 4 5 9 6
## 16 60 75 88 100 110 200 250 300
## 7 4 14 4 13 4 25 12 18
## 350 400 500 600 700 750 760 800 1000
## 9 18 102 6 17 20 8 10 224
## 1200 1400 1500 1700 1750 2000 2200 2270 2300
## 10 7 79 9 2 441 11 3 4
## 2400 2500 2600 2700 3000 3500 3600 4000 4500
## 7 29 6 5 275 23 6 210 32
## 4600 4700 4800 5000 5200 5500 5800 6000 6500
## 6 5 10 573 6 9 4 289 2
## 7000 7500 7680 8000 8400 8500 9000 9500 9720
## 157 15 4 222 7 6 154 33 5
## 10000 10500 10800 11000 11200 11300 11500 12000 12500
## 1017 29 6 43 6 4 6 231 10
## 12600 13000 13500 13600 14000 14400 14500 15000 15500
## 5 78 5 7 87 6 4 495 13
## 15600 16000 16500 16800 17000 17400 17500 18000 18400
## 7 75 13 6 79 6 16 87 4
## 18500 19000 19700 20000 20500 21000 22000 22500 22700
## 8 19 6 744 3 46 66 3 6
## 23000 23500 24000 24020 25000 25500 26000 27000 28000
## 86 5 68 3 178 5 15 36 54
## 29000 29400 30000 30500 31000 32000 32400 33000 34000
## 8 8 455 16 12 62 5 38 10
## 35000 36000 36800 37000 38000 38200 39000 40000 41000
## 128 27 4 7 34 6 14 315 6
## 42000 42200 43500 44000 45000 45500 46000 47000 47500
## 8 4 5 14 64 5 7 17 5
## 48000 48400 49000 50000 52000 53000 54000 55000 56000
## 31 5 26 498 19 9 9 32 19
## 57000 58000 59000 60000 60600 61800 62000 62500 63000
## 5 3 7 189 7 5 15 6 10
## 64000 64600 65000 66000 66980 67000 68000 69000 70000
## 6 5 29 14 6 7 7 5 109
## 71000 73000 74000 75000 76000 78000 80000 82000 84000
## 3 18 5 57 8 13 103 7 10
## 84500 85000 88000 88888 89000 90000 90500 93000 96000
## 3 31 7 6 5 35 5 3 5
## 97000 99000 1e+05 104000 105000 108000 112000 115000 119000
## 9 8 261 6 8 4 3 5 5
## 120000 124000 125000 126000 128000 136000 138000 139000 140000
## 42 4 6 8 5 5 5 7 18
## 143000 147000 150000 157000 160000 165000 173000 195000 196000
## 8 3 55 7 10 5 4 7 8
## 2e+05 219000 220000 230000 240000 250000 260000 270000 280000
## 47 7 7 17 3 25 6 5 4
## 3e+05 316000 320000 330000 350000 4e+05 450000 5e+05 520000
## 44 7 9 4 13 18 4 10 5
## 6e+05 640000 650000 7e+05 or more 888888 <NA>
## 17 4 5 64 81 27155
# Top-code b_sav (the "(Save)" household question, per its label) at the
# break point returned by percentile_checker(); 88888 and 888888888 are
# passed as missing codes to both helpers.
percentile_99.5 <- percentile_checker(
  "b_sav",
  missing = c(88888, 888888888)
)
mydata <- top_recode(
  variable = "b_sav",
  break_point = percentile_99.5,
  missing = c(88888, 888888888)
)
## [1] "Frequency table before encoding"
## b_sav. b_(Save) Now, I Have Some Questions About Your Household. How Much Did Your Hous
## 0 1 10 18 24 88 100 200 250 300 400 500 522
## 10646 4 6 6 4 40 33 47 4 20 32 234 3
## 600 700 720 750 800 900 1000 1008 1100 1200 1400 1500 1600
## 27 9 5 6 15 7 582 4 3 11 6 93 5
## 1750 1800 2000 2200 2400 2500 3000 3500 4000 4500 4600 5000 5200
## 6 10 736 6 10 122 344 55 319 34 4 776 4
## 5500 5600 6000 6200 6300 6500 7000 7400 7500 8000 8250 9000 9500
## 8 5 206 5 4 7 125 6 7 203 6 62 10
## 10000 10500 11000 11500 12000 12500 13000 14000 14400 15000 16000 17000 18000
## 595 12 27 5 95 6 24 52 3 193 17 19 52
## 19000 20000 21000 22000 23000 23500 24000 25000 26000 26500 27000 28000 30000
## 11 503 6 26 18 5 32 152 13 4 5 12 218
## 31000 32000 33000 35000 36000 39000 40000 42000 45000 48000 50000 51000 52000
## 6 8 3 31 3 15 169 5 38 6 222 3 9
## 55000 60000 62000 65000 69000 70000 80000 85000 86000 88888 90000 95000 1e+05
## 7 76 4 5 6 19 39 4 3 26 4 6 59
## 120000 123000 125000 130000 150000 152000 160000 178000 190000 2e+05 228000 230000 240000
## 35 6 5 12 40 7 5 7 5 45 6 4 6
## 250000 270000 3e+05 330000 340000 346000 350000 4e+05 450000 5e+05 6e+05 780000 850000
## 14 3 6 7 4 5 3 18 10 18 3 6 6
## 1200000 1900000 2500000 3e+06 5e+06 2.5e+07 888888888 <NA>
## 7 4 9 9 7 3 64 19563
## [1] "Frequency table after encoding"
## b_sav. b_(Save) Now, I Have Some Questions About Your Household. How Much Did Your Hous
## 0 1 10 18 24 88 100 200 250
## 10646 4 6 6 4 40 33 47 4
## 300 400 500 522 600 700 720 750 800
## 20 32 234 3 27 9 5 6 15
## 900 1000 1008 1100 1200 1400 1500 1600 1750
## 7 582 4 3 11 6 93 5 6
## 1800 2000 2200 2400 2500 3000 3500 4000 4500
## 10 736 6 10 122 344 55 319 34
## 4600 5000 5200 5500 5600 6000 6200 6300 6500
## 4 776 4 8 5 206 5 4 7
## 7000 7400 7500 8000 8250 9000 9500 10000 10500
## 125 6 7 203 6 62 10 595 12
## 11000 11500 12000 12500 13000 14000 14400 15000 16000
## 27 5 95 6 24 52 3 193 17
## 17000 18000 19000 20000 21000 22000 23000 23500 24000
## 19 52 11 503 6 26 18 5 32
## 25000 26000 26500 27000 28000 30000 31000 32000 33000
## 152 13 4 5 12 218 6 8 3
## 35000 36000 39000 40000 42000 45000 48000 50000 51000
## 31 3 15 169 5 38 6 222 3
## 52000 55000 60000 62000 65000 69000 70000 80000 85000
## 9 7 76 4 5 6 19 39 4
## 86000 88888 90000 95000 1e+05 120000 123000 125000 130000
## 3 26 4 6 59 35 6 5 12
## 150000 152000 160000 178000 190000 2e+05 228000 230000 240000
## 40 7 5 7 5 45 6 4 6
## 250000 270000 3e+05 330000 340000 346000 350000 4e+05 or more 888888888
## 14 3 6 7 4 5 3 100 64
## <NA>
## 19563
# Top-code b_earn_a_ (earnings in the last week, per the variable label) at
# the break point from percentile_checker(); the listed sentinel codes are
# passed as missing values to both helpers.
percentile_99.5 <- percentile_checker(
  "b_earn_a_",
  missing = c(8888, 66666, 88888)
)
mydata <- top_recode(
  variable = "b_earn_a_",
  break_point = percentile_99.5,
  missing = c(8888, 66666, 88888)
)
## [1] "Frequency table before encoding"
## b_earn_a_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 2 3 8 17 100 120 150 200 300 350 400 500 550 600 1000 1500 1850 2000 2500 3000
## 110 1 1 1 2 3 2 1 8 1 1 3 4 1 1 2 2 1 3 2 3
## 3500 4000 5000 6000 8000 8888 9000 20000 25000 66666 88888 <NA>
## 1 1 1 3 2 2 1 1 1 8 3 37588
## [1] "Frequency table after encoding"
## b_earn_a_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 2 3 8 17 100 120 150 200
## 110 1 1 1 2 3 2 1 8
## 300 350 400 500 550 600 1000 1500 1850
## 1 1 3 4 1 1 2 2 1
## 2000 2500 3000 3500 4000 5000 6000 8000 8888
## 3 2 3 1 1 1 3 2 2
## 9000 20000 20925 or more 66666 88888 <NA>
## 1 1 1 8 3 37588
# Top-code b_earn_b_ (earnings in the last week, per the variable label) at
# the break point from percentile_checker(); the listed sentinel codes are
# passed as missing values to both helpers.
percentile_99.5 <- percentile_checker(
  "b_earn_b_",
  missing = c(8888, 66666, 88888)
)
mydata <- top_recode(
  variable = "b_earn_b_",
  break_point = percentile_99.5,
  missing = c(8888, 66666, 88888)
)
## [1] "Frequency table before encoding"
## b_earn_b_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 3 5 200 1500 1800 2000 3000 3008 4000 5000 6000 66666 88888 <NA>
## 85 2 1 2 1 1 4 3 1 1 1 1 7 3 37652
## [1] "Frequency table after encoding"
## b_earn_b_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 3 5 200 1500 1800 2000 3000 3008 4000
## 85 2 1 2 1 1 4 3 1 1
## 5000 5489 or more 66666 88888 <NA>
## 1 1 7 3 37652
# Top-code b_earn_c_ (earnings in the last week, per the variable label) at
# the break point from percentile_checker(); the listed sentinel codes are
# passed as missing values to both helpers.
percentile_99.5 <- percentile_checker(
  "b_earn_c_",
  missing = c(6666, 8888, 66666, 88888, 666666, 888888)
)
mydata <- top_recode(
  variable = "b_earn_c_",
  break_point = percentile_99.5,
  missing = c(6666, 8888, 66666, 88888, 666666, 888888)
)
## [1] "Frequency table before encoding"
## b_earn_c_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 1 2 3 6 9 10 22 50 100 150 200 240 250 300 500 700 800
## 841 4 2 1 3 3 1 1 1 5 1 8 1 4 2 8 2 2
## 1000 1500 2000 3000 5000 6000 66666 88888 666666 <NA>
## 3 1 1 3 1 1 17 7 1 36840
## [1] "Frequency table after encoding"
## b_earn_c_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 1 2 3 6 9 10 22 50 100
## 841 4 2 1 3 3 1 1 1 5
## 150 200 240 250 300 500 700 800 1000 1500
## 1 8 1 4 2 8 2 2 3 1
## 2000 2504 or more 66666 88888 666666 <NA>
## 1 5 17 7 1 36840
# Top-code b_earn_d_ (earnings in the last week, per the variable label) at
# the break point from percentile_checker(); the listed sentinel codes are
# passed as missing values to both helpers.
percentile_99.5 <- percentile_checker(
  "b_earn_d_",
  missing = c(6666, 8888, 66666, 88888, 666666, 888888)
)
mydata <- top_recode(
  variable = "b_earn_d_",
  break_point = percentile_99.5,
  missing = c(6666, 8888, 66666, 88888, 666666, 888888)
)
## [1] "Frequency table before encoding"
## b_earn_d_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 1 6 50 100 150 300 500 550 1500 6666 8888 66666 88888 666666 888888 <NA>
## 549 1 1 2 1 1 2 2 1 2 1 3 55 17 3 1 37123
## [1] "Frequency table after encoding"
## b_earn_d_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 1 6 50 100 150 300 500 509 or more 6666
## 549 1 1 2 1 1 2 2 3 1
## 8888 66666 88888 666666 888888 <NA>
## 3 55 17 3 1 37123
# Top-code b_earn_e_ (earnings in the last week, per the variable label) at
# the break point from percentile_checker(); the listed sentinel codes are
# passed as missing values to both helpers.
percentile_99.5 <- percentile_checker(
  "b_earn_e_",
  missing = c(6666, 8888, 66666, 88888, 666666, 888888)
)
mydata <- top_recode(
  variable = "b_earn_e_",
  break_point = percentile_99.5,
  missing = c(6666, 8888, 66666, 88888, 666666, 888888)
)
## [1] "Frequency table before encoding"
## b_earn_e_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 2 20 400 1000 1500 2500 66666 <NA>
## 28 1 1 1 1 1 1 3 37728
## [1] "Frequency table after encoding"
## b_earn_e_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 2 20 400 1000 1500 2335 or more 66666 <NA>
## 28 1 1 1 1 1 1 3 37728
# Top-code b_earn_f_ (earnings in the last week, per the variable label) at
# the break point from percentile_checker(); the listed sentinel codes are
# passed as missing values to both helpers.
percentile_99.5 <- percentile_checker(
  "b_earn_f_",
  missing = c(6666, 8888, 66666, 88888, 666666, 888888)
)
mydata <- top_recode(
  variable = "b_earn_f_",
  break_point = percentile_99.5,
  missing = c(6666, 8888, 66666, 88888, 666666, 888888)
)
## [1] "Frequency table before encoding"
## b_earn_f_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 20 100 200 300 600 66666 88888 666666 <NA>
## 334 1 1 1 1 1 20 3 1 37402
## [1] "Frequency table after encoding"
## b_earn_f_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 20 100 200 231 or more 66666 88888 666666 <NA>
## 334 1 1 1 2 20 3 1 37402
# Top-code b_earn_h_ (earnings in the last week, per the variable label) at
# the break point from percentile_checker(); the listed sentinel codes are
# passed as missing values to both helpers.
percentile_99.5 <- percentile_checker(
  "b_earn_h_",
  missing = c(
    6666, 8888, 66666, 88888, 666666, 888888, 666666666, 888888888
  )
)
mydata <- top_recode(
  variable = "b_earn_h_",
  break_point = percentile_99.5,
  missing = c(
    6666, 8888, 66666, 88888, 666666, 888888, 666666666, 888888888
  )
)
## [1] "Frequency table before encoding"
## b_earn_h_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 1 2 3 4 5 6 7 10 30 50 100 150
## 3683 15 4 2 2 1 1 1 1 1 3 5 4
## 200 250 300 400 500 600 800 1000 1500 2000 3000 6666 8888
## 6 2 6 1 10 1 1 4 2 1 1 1 2
## 66666 88888 666666 666666666 <NA>
## 161 51 4 3 33785
## [1] "Frequency table after encoding"
## b_earn_h_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 1 2 3 4 5 6 7 10 30
## 3683 15 4 2 2 1 1 1 1 1
## 50 100 150 200 250 300 400 500 or more 6666 8888
## 3 5 4 6 2 6 1 20 1 2
## 66666 88888 666666 666666666 <NA>
## 161 51 4 3 33785
# Top-code b_earn_i_ (earnings in the last week, per the variable label) at
# the break point from percentile_checker(); the listed sentinel codes are
# passed as missing values to both helpers.
percentile_99.5 <- percentile_checker(
  "b_earn_i_",
  missing = c(
    6666, 8888, 66666, 88888, 666666, 888888, 666666666, 888888888
  )
)
mydata <- top_recode(
  variable = "b_earn_i_",
  break_point = percentile_99.5,
  missing = c(
    6666, 8888, 66666, 88888, 666666, 888888, 666666666, 888888888
  )
)
## [1] "Frequency table before encoding"
## b_earn_i_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 1 2 3 4 5 8 30 50 100 150 200 300 400 500 600 650 700
## 886 3 3 2 3 2 2 1 2 1 2 2 2 2 10 3 1 3
## 800 1000 1200 1500 3000 8888 66666 88888 666666 <NA>
## 1 4 1 1 1 1 53 20 1 36752
## [1] "Frequency table after encoding"
## b_earn_i_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 1 2 3 4 5 8 30 50 100
## 886 3 3 2 3 2 2 1 2 1
## 150 200 300 400 500 600 650 700 800 1000 or more
## 2 2 2 2 10 3 1 3 1 7
## 8888 66666 88888 666666 <NA>
## 1 53 20 1 36752
# Top-code b_earn_j_ (earnings in the last week, per the variable label) at
# the break point from percentile_checker(); the listed sentinel codes are
# passed as missing values to both helpers.
percentile_99.5 <- percentile_checker(
  "b_earn_j_",
  missing = c(
    6666, 8888, 66666, 88888, 666666, 888888, 666666666, 888888888
  )
)
mydata <- top_recode(
  variable = "b_earn_j_",
  break_point = percentile_99.5,
  missing = c(
    6666, 8888, 66666, 88888, 666666, 888888, 666666666, 888888888
  )
)
## [1] "Frequency table before encoding"
## b_earn_j_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 1 2 200 350 400 500 700 800 900 1200 1500 3500 66666 88888 <NA>
## 354 1 1 2 1 1 9 1 2 1 1 1 1 7 2 37380
## [1] "Frequency table after encoding"
## b_earn_j_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 1 2 200 350 400 500 700 800 900
## 354 1 1 2 1 1 9 1 2 1
## 1200 1237 or more 66666 88888 <NA>
## 1 2 7 2 37380
# Top-code b_earn_l_ (earnings in the last week, per the variable label) at
# the break point from percentile_checker(); the listed sentinel codes are
# passed as missing values to both helpers.
percentile_99.5 <- percentile_checker(
  "b_earn_l_",
  missing = c(
    6666, 8888, 66666, 88888, 666666, 888888, 666666666, 888888888
  )
)
mydata <- top_recode(
  variable = "b_earn_l_",
  break_point = percentile_99.5,
  missing = c(
    6666, 8888, 66666, 88888, 666666, 888888, 666666666, 888888888
  )
)
## [1] "Frequency table before encoding"
## b_earn_l_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 40 150 200 300 350 400 500 600 1000 1500 2000 2500
## 553 1 1 3 1 1 2 3 4 9 1 4 2
## 3000 4000 5000 6000 7000 9000 66666 88888 666666666 <NA>
## 4 1 1 2 1 3 5 4 5 37154
## [1] "Frequency table after encoding"
## b_earn_l_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 40 150 200 300 350 400 500 600 1000
## 553 1 1 3 1 1 2 3 4 9
## 1500 2000 2500 3000 4000 5000 6000 7000 7039 or more 66666
## 1 4 2 4 1 1 2 1 3 5
## 88888 666666666 <NA>
## 4 5 37154
# Top-code b_earn_o_ (earnings in the last week, per the variable label) at
# the break point from percentile_checker(); the listed sentinel codes are
# passed as missing values to both helpers.
percentile_99.5 <- percentile_checker(
  "b_earn_o_",
  missing = c(
    6666, 8888, 66666, 88888, 666666, 888888, 666666666, 888888888
  )
)
mydata <- top_recode(
  variable = "b_earn_o_",
  break_point = percentile_99.5,
  missing = c(
    6666, 8888, 66666, 88888, 666666, 888888, 666666666, 888888888
  )
)
## [1] "Frequency table before encoding"
## b_earn_o_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 3 100 150 350 1000 2000 5000 66666 <NA>
## 48 1 1 1 1 1 1 1 1 37709
## [1] "Frequency table after encoding"
## b_earn_o_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 3 100 150 350 1000 2000 4189 or more 66666 <NA>
## 48 1 1 1 1 1 1 1 1 37709
# Top-code b_earn_p_ (earnings in the last week, per the variable label) at
# the break point from percentile_checker(); the listed sentinel codes are
# passed as missing values to both helpers.
percentile_99.5 <- percentile_checker(
  "b_earn_p_",
  missing = c(
    6666, 8888, 66666, 88888, 666666, 888888, 666666666, 888888888
  )
)
mydata <- top_recode(
  variable = "b_earn_p_",
  break_point = percentile_99.5,
  missing = c(
    6666, 8888, 66666, 88888, 666666, 888888, 666666666, 888888888
  )
)
## [1] "Frequency table before encoding"
## b_earn_p_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 2 20 50 100 200 250 300 350 700 1000 1500 3000 7000 66666 <NA>
## 21 1 1 1 1 1 2 1 2 1 1 1 1 1 5 37724
## [1] "Frequency table after encoding"
## b_earn_p_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 2 20 50 100 200 250 300 350 700
## 21 1 1 1 1 1 2 1 2 1
## 1000 1500 3000 6300 or more 66666 <NA>
## 1 1 1 1 5 37724
# Top-code b_earn_q_ (earnings in the last week, per the variable label) at
# the break point from percentile_checker(); the listed sentinel codes are
# passed as missing values to both helpers.
percentile_99.5 <- percentile_checker(
  "b_earn_q_",
  missing = c(
    6666, 8888, 66666, 88888, 666666, 888888, 666666666, 888888888
  )
)
mydata <- top_recode(
  variable = "b_earn_q_",
  break_point = percentile_99.5,
  missing = c(
    6666, 8888, 66666, 88888, 666666, 888888, 666666666, 888888888
  )
)
## [1] "Frequency table before encoding"
## b_earn_q_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 2 5 50 100 200 250 300 500 66666 88888 666666 <NA>
## 586 2 1 1 2 4 1 1 2 34 5 1 37125
## [1] "Frequency table after encoding"
## b_earn_q_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 2 5 50 100 200 250 or more 66666 88888 666666
## 586 2 1 1 2 4 4 34 5 1
## <NA>
## 37125
# Top-code b_earn_r_ (earnings in the last week, per the variable label) at
# the break point from percentile_checker(); the listed sentinel codes are
# passed as missing values to both helpers.
percentile_99.5 <- percentile_checker(
  "b_earn_r_",
  missing = c(
    6666, 8888, 66666, 88888, 666666, 888888, 666666666, 888888888
  )
)
mydata <- top_recode(
  variable = "b_earn_r_",
  break_point = percentile_99.5,
  missing = c(
    6666, 8888, 66666, 88888, 666666, 888888, 666666666, 888888888
  )
)
## [1] "Frequency table before encoding"
## b_earn_r_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 4 100 250 400 500 1000 1500 3500 <NA>
## 128 1 1 1 1 1 2 1 1 37628
## [1] "Frequency table after encoding"
## b_earn_r_. b_How Much Did <<Name>> Earn From <<Emp>>In A Last Week?
## 0 4 100 250 400 500 1000 1500 2139 or more <NA>
## 128 1 1 1 1 1 2 1 1 37628
# Household roster size: coerce the column to numeric, look at its
# distribution, then collapse counts of 10 and above into one top category.
mydata[["e_hhroster_count"]] <- as.numeric(mydata[["e_hhroster_count"]])
# The hist() argument is left as mydata$... because the plot title is derived
# from the expression text.
hist(mydata$e_hhroster_count)
mydata <- top_recode(
  variable = "e_hhroster_count",
  break_point = 10,
  missing = NA
)
## [1] "Frequency table before encoding"
## e_hhroster_count.
## 2 3 4 5 6 7 8 9 10 11 12 13 14 <NA>
## 154 910 2678 4169 4500 3508 1995 942 387 210 88 10 12 18202
## [1] "Frequency table after encoding"
## e_hhroster_count. 10
## 2 3 4 5 6 7 8 9 10 or more <NA>
## 154 910 2678 4169 4500 3508 1995 942 707 18202
# Eligible-child count: coerce the column to numeric, look at its
# distribution, then collapse counts of 10 and above into one top category.
mydata$e_eligiblechild_ <- as.numeric(mydata$e_eligiblechild_)
hist(mydata$e_eligiblechild_)
# The result must be assigned back to mydata (as every other top_recode step
# does); writing it to a side copy would leave the un-recoded column in the
# data set that the later steps operate on.
mydata <- top_recode(
  variable = "e_eligiblechild_",
  break_point = 10,
  missing = NA
)
## [1] "Frequency table before encoding"
## e_eligiblechild_.
## 2 3 4 5 6 7 8 9 10 11 12 <NA>
## 463 2472 2273 1493 808 332 117 39 18 9 3 29738
## [1] "Frequency table after encoding"
## e_eligiblechild_. 10
## 2 3 4 5 6 7 8 9 10 or more <NA>
## 463 2472 2273 1493 808 332 117 39 30 29738
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)
# Every name in this vector is passed to capture_tables() after the list.
# Each variable is listed exactly once.
# NOTE(review): "b_inc1a_wsh_masked" previously appeared twice in a row; if the
# second entry was meant to be a different b_inc1a_* variable, add it here.
# NOTE(review): "b_time_min_tue_o_" and "b_time_min_thu_o_" are not listed,
# although every other b_time_* day/member combination (a-r) is -- confirm
# that this is intentional.
indirect_PII <- c("schdays_",
"ownfarm_",
"homehrs_",
"hrsun",
"hrmon",
"hrtues",
"hrwed",
"hrthur",
"hrfri",
"hrsat",
"bhrsun",
"bhrmon",
"bhrtues",
"bhrwed",
"bhrthur",
"bhrfri",
"bhrsat",
"chrsun",
"chrmon",
"chrtues",
"chrwed",
"chrthur",
"chrfri",
"chrsat",
"dhrsun",
"dhrmon",
"dhrtues",
"dhrwed",
"dhrthur",
"dhrfri",
"dhrsat",
"ehrsun",
"ehrmon",
"ehrtues",
"ehrwed",
"ehrthur",
"ehrfri",
"ehrsat",
"fhrsun",
"fhrmon",
"fhrtues",
"fhrwed",
"fhrthur",
"fhrfri",
"fhrsat",
"ghrsun",
"ghrmon",
"ghrtues",
"ghrwed",
"ghrthur",
"ghrfri",
"ghrsat",
"hhrsun",
"hhrmon",
"hhrtues",
"hhrwed",
"hhrthur",
"hhrfri",
"hhrsat",
"ihrsun",
"ihrmon",
"ihrtues",
"ihrwed",
"ihrthur",
"ihrfri",
"ihrsat",
"jhrsun",
"jhrmon",
"jhrtues",
"jhrwed",
"jhrthur",
"jhrfri",
"jhrsat",
"khrsun",
"khrmon",
"khrtues",
"khrwed",
"khrthur",
"khrfri",
"khrsat",
"lhrsun",
"lhrmon",
"lhrtues",
"lhrwed",
"lhrthur",
"lhrfri",
"lhrsat",
"mhrsun",
"mhrmon",
"mhrtues",
"mhrwed",
"mhrthur",
"mhrfri",
"mhrsat",
"nhrsun",
"nhrmon",
"nhrtues",
"nhrwed",
"nhrthur",
"nhrfri",
"nhrsat",
"ohrsun",
"ohrmon",
"ohrtues",
"ohrwed",
"ohrthur",
"ohrfri",
"ohrsat",
"phrsun",
"phrmon",
"phrtues",
"phrwed",
"phrthur",
"phrfri",
"phrsat",
"qhrsun",
"qhrmon",
"qhrtues",
"qhrwed",
"qhrthur",
"qhrfri",
"qhrsat",
"rhrsun",
"rhrmon",
"rhrtues",
"rhrwed",
"rhrthur",
"rhrfri",
"rhrsat",
"thrsun",
"thrmon",
"thrtues",
"thrwed",
"thrthur",
"thrfri",
"thrsat",
"othhaz_a_",
"exp_e_",
"religion_",
"yardwork",
"bird",
"firewood",
"alcohol",
"brick",
"hrsworked",
"latehrs",
"b_preprimary",
"b_yardwork",
"b_bird",
"b_bar",
"b_alcohol",
"b_hrsun",
"b_hrmon",
"b_hrtue",
"b_hrwed",
"b_hrthur",
"b_hrfri",
"b_hrsat",
"b_hrsworked",
"b_sex_",
"b_currentenroll",
"b_schdays_",
"b_sex_hh",
"e_preprimary",
"e_yardwork",
"e_bird",
"e_firewood",
"e_alcohol",
"e_brick",
"e_hrsworked",
"e_latehrs",
"e_schdays_",
"b_nenroll_",
"b_grade_",
"b_scheme_",
"b_schmtyp_",
"b_schmsrc_",
"b_emp_b_",
"b_emp_e_",
"b_emp_g_",
"b_emp_h_",
"b_emp_i_",
"b_emp_j_",
"b_emp_m_masked",
"b_emp_n_",
"b_emp_o_",
"b_emp_p_",
"b_emp_q_",
"b_emp_r_",
"b_ownfarm_",
"b_tobowntime_",
"b_tobactivity_a_",
"b_tobactivity_b_",
"b_tobactivity_c_",
"b_tobactivity_d_",
"b_tobactivity_e_",
"b_tobactivity_i_",
"b_homehrs_",
"b_tobhrsown_a_",
"b_tobhrsown_b_",
"b_expo_",
"b_time_",
"b_adulttob_",
"b_reli_",
"b_savings",
"b_savingsall",
"b_vslawho",
"b_vslause",
"b_loanwhere",
"b_loanuse",
"b_fs1",
"b_water",
"b_inc1a_wsh_masked",
"b_inc1a_pmp_masked",
"b_inc2",
"b_inc2_number",
"b_inc3a",
"b_els_a_",
"b_els_b_",
"b_els_c_",
"b_els_d_",
"b_els_e_",
"b_els_f_",
"b_els_g_",
"b_els_h_",
"b_els_i_",
"b_els_j_",
"b_els_k_",
"b_els_l_",
"b_els_m_",
"b_els_n_",
"b_els_o_",
"b_els_p_",
"b_els_q_",
"b_els_r_",
"b_time_hrs_sun_a_",
"b_time_hrs_sun_b_",
"b_time_hrs_sun_c_",
"b_time_hrs_sun_d_",
"b_time_hrs_sun_e_",
"b_time_hrs_sun_f_",
"b_time_hrs_sun_g_",
"b_time_hrs_sun_h_",
"b_time_hrs_sun_i_",
"b_time_hrs_sun_j_",
"b_time_hrs_sun_k_",
"b_time_hrs_sun_l_",
"b_time_hrs_sun_m_",
"b_time_hrs_sun_n_",
"b_time_hrs_sun_o_",
"b_time_hrs_sun_p_",
"b_time_hrs_sun_q_",
"b_time_hrs_sun_r_",
"b_time_min_sun_a_",
"b_time_min_sun_b_",
"b_time_min_sun_c_",
"b_time_min_sun_d_",
"b_time_min_sun_e_",
"b_time_min_sun_f_",
"b_time_min_sun_g_",
"b_time_min_sun_h_",
"b_time_min_sun_i_",
"b_time_min_sun_j_",
"b_time_min_sun_k_",
"b_time_min_sun_l_",
"b_time_min_sun_m_",
"b_time_min_sun_n_",
"b_time_min_sun_o_",
"b_time_min_sun_p_",
"b_time_min_sun_q_",
"b_time_min_sun_r_",
"b_time_hrs_mon_a_",
"b_time_hrs_mon_b_",
"b_time_hrs_mon_c_",
"b_time_hrs_mon_d_",
"b_time_hrs_mon_e_",
"b_time_hrs_mon_f_",
"b_time_hrs_mon_g_",
"b_time_hrs_mon_h_",
"b_time_hrs_mon_i_",
"b_time_hrs_mon_j_",
"b_time_hrs_mon_k_",
"b_time_hrs_mon_l_",
"b_time_hrs_mon_m_",
"b_time_hrs_mon_n_",
"b_time_hrs_mon_o_",
"b_time_hrs_mon_p_",
"b_time_hrs_mon_q_",
"b_time_hrs_mon_r_",
"b_time_min_mon_a_",
"b_time_min_mon_b_",
"b_time_min_mon_c_",
"b_time_min_mon_d_",
"b_time_min_mon_e_",
"b_time_min_mon_f_",
"b_time_min_mon_g_",
"b_time_min_mon_h_",
"b_time_min_mon_i_",
"b_time_min_mon_j_",
"b_time_min_mon_k_",
"b_time_min_mon_l_",
"b_time_min_mon_m_",
"b_time_min_mon_n_",
"b_time_min_mon_o_",
"b_time_min_mon_p_",
"b_time_min_mon_q_",
"b_time_min_mon_r_",
"b_time_hrs_tue_a_",
"b_time_hrs_tue_b_",
"b_time_hrs_tue_c_",
"b_time_hrs_tue_d_",
"b_time_hrs_tue_e_",
"b_time_hrs_tue_f_",
"b_time_hrs_tue_g_",
"b_time_hrs_tue_h_",
"b_time_hrs_tue_i_",
"b_time_hrs_tue_j_",
"b_time_hrs_tue_k_",
"b_time_hrs_tue_l_",
"b_time_hrs_tue_m_",
"b_time_hrs_tue_n_",
"b_time_hrs_tue_o_",
"b_time_hrs_tue_p_",
"b_time_hrs_tue_q_",
"b_time_hrs_tue_r_",
"b_time_min_tue_a_",
"b_time_min_tue_b_",
"b_time_min_tue_c_",
"b_time_min_tue_d_",
"b_time_min_tue_e_",
"b_time_min_tue_f_",
"b_time_min_tue_g_",
"b_time_min_tue_h_",
"b_time_min_tue_i_",
"b_time_min_tue_j_",
"b_time_min_tue_k_",
"b_time_min_tue_l_",
"b_time_min_tue_m_",
"b_time_min_tue_n_",
"b_time_min_tue_p_",
"b_time_min_tue_q_",
"b_time_min_tue_r_",
"b_time_hrs_wed_a_",
"b_time_hrs_wed_b_",
"b_time_hrs_wed_c_",
"b_time_hrs_wed_d_",
"b_time_hrs_wed_e_",
"b_time_hrs_wed_f_",
"b_time_hrs_wed_g_",
"b_time_hrs_wed_h_",
"b_time_hrs_wed_i_",
"b_time_hrs_wed_j_",
"b_time_hrs_wed_k_",
"b_time_hrs_wed_l_",
"b_time_hrs_wed_m_",
"b_time_hrs_wed_n_",
"b_time_hrs_wed_o_",
"b_time_hrs_wed_p_",
"b_time_hrs_wed_q_",
"b_time_hrs_wed_r_",
"b_time_min_wed_a_",
"b_time_min_wed_b_",
"b_time_min_wed_c_",
"b_time_min_wed_d_",
"b_time_min_wed_e_",
"b_time_min_wed_f_",
"b_time_min_wed_g_",
"b_time_min_wed_h_",
"b_time_min_wed_i_",
"b_time_min_wed_j_",
"b_time_min_wed_k_",
"b_time_min_wed_l_",
"b_time_min_wed_m_",
"b_time_min_wed_n_",
"b_time_min_wed_o_",
"b_time_min_wed_p_",
"b_time_min_wed_q_",
"b_time_min_wed_r_",
"b_time_hrs_thu_a_",
"b_time_hrs_thu_b_",
"b_time_hrs_thu_c_",
"b_time_hrs_thu_d_",
"b_time_hrs_thu_e_",
"b_time_hrs_thu_f_",
"b_time_hrs_thu_g_",
"b_time_hrs_thu_h_",
"b_time_hrs_thu_i_",
"b_time_hrs_thu_j_",
"b_time_hrs_thu_k_",
"b_time_hrs_thu_l_",
"b_time_hrs_thu_m_",
"b_time_hrs_thu_n_",
"b_time_hrs_thu_o_",
"b_time_hrs_thu_p_",
"b_time_hrs_thu_q_",
"b_time_hrs_thu_r_",
"b_time_min_thu_a_",
"b_time_min_thu_b_",
"b_time_min_thu_c_",
"b_time_min_thu_d_",
"b_time_min_thu_e_",
"b_time_min_thu_f_",
"b_time_min_thu_g_",
"b_time_min_thu_h_",
"b_time_min_thu_i_",
"b_time_min_thu_j_",
"b_time_min_thu_k_",
"b_time_min_thu_l_",
"b_time_min_thu_m_",
"b_time_min_thu_n_",
"b_time_min_thu_p_",
"b_time_min_thu_q_",
"b_time_min_thu_r_",
"b_time_hrs_fri_a_",
"b_time_hrs_fri_b_",
"b_time_hrs_fri_c_",
"b_time_hrs_fri_d_",
"b_time_hrs_fri_e_",
"b_time_hrs_fri_f_",
"b_time_hrs_fri_g_",
"b_time_hrs_fri_h_",
"b_time_hrs_fri_i_",
"b_time_hrs_fri_j_",
"b_time_hrs_fri_k_",
"b_time_hrs_fri_l_",
"b_time_hrs_fri_m_",
"b_time_hrs_fri_n_",
"b_time_hrs_fri_o_",
"b_time_hrs_fri_p_",
"b_time_hrs_fri_q_",
"b_time_hrs_fri_r_",
"b_time_min_fri_a_",
"b_time_min_fri_b_",
"b_time_min_fri_c_",
"b_time_min_fri_d_",
"b_time_min_fri_e_",
"b_time_min_fri_f_",
"b_time_min_fri_g_",
"b_time_min_fri_h_",
"b_time_min_fri_i_",
"b_time_min_fri_j_",
"b_time_min_fri_k_",
"b_time_min_fri_l_",
"b_time_min_fri_m_",
"b_time_min_fri_n_",
"b_time_min_fri_o_",
"b_time_min_fri_p_",
"b_time_min_fri_q_",
"b_time_min_fri_r_",
"b_time_hrs_sat_a_",
"b_time_hrs_sat_b_",
"b_time_hrs_sat_c_",
"b_time_hrs_sat_d_",
"b_time_hrs_sat_e_",
"b_time_hrs_sat_f_",
"b_time_hrs_sat_g_",
"b_time_hrs_sat_h_",
"b_time_hrs_sat_i_",
"b_time_hrs_sat_j_",
"b_time_hrs_sat_k_",
"b_time_hrs_sat_l_",
"b_time_hrs_sat_m_",
"b_time_hrs_sat_n_",
"b_time_hrs_sat_o_",
"b_time_hrs_sat_p_",
"b_time_hrs_sat_q_",
"b_time_hrs_sat_r_",
"b_time_min_sat_a_",
"b_time_min_sat_b_",
"b_time_min_sat_c_",
"b_time_min_sat_d_",
"b_time_min_sat_e_",
"b_time_min_sat_f_",
"b_time_min_sat_g_",
"b_time_min_sat_h_",
"b_time_min_sat_i_",
"b_time_min_sat_j_",
"b_time_min_sat_k_",
"b_time_min_sat_l_",
"b_time_min_sat_m_",
"b_time_min_sat_n_",
"b_time_min_sat_o_",
"b_time_min_sat_p_",
"b_time_min_sat_q_",
"b_time_min_sat_r_",
"b_earn_g_",
"b_earn_k_",
"b_earn_m_",
"b_earn_n_",
"e_live_",
"e_nenroll_",
"e_grade_",
"e_scheme_",
"e_schmtyp_",
"e_schmtyp_a_",
"e_schmtyp_g_",
"e_schmtyp_h_",
"e_schmsrc_",
"e_schmsrc_a_",
"e_schmsrc_c_",
"e_schmsrc_f_",
"e_schmsrc_g_",
"e_schmsrc_h_",
"e_emp_a_",
"e_emp_d_",
"e_emp_e_",
"e_emp_f_",
"e_emp_g_",
"e_emp_h_",
"e_emp_i_",
"e_emp_j_",
"e_emp_l_",
"e_emp_m_",
"e_emp_n_",
"e_emp_o_masked",
"e_emp_p_",
"e_emp_q_",
"e_emp_r_",
"e_ownfarm_",
"e_tobowntime_",
"e_tobactivity_b_",
"e_tobactivity_c_",
"e_tobactivity_f_",
"e_tobactivity_j_",
"e_homehrs_",
"e_hrsun",
"e_hrmon",
"e_hrtues",
"e_hrwed",
"e_hrthur",
"e_hrfri",
"e_hrsat",
"e_bhrsun",
"e_bhrmon",
"e_bhrtues",
"e_bhrwed",
"e_bhrthur",
"e_bhrfri",
"e_bhrsat",
"e_chrsun",
"e_chrmon",
"e_chrtues",
"e_chrwed",
"e_chrthur",
"e_chrfri",
"e_chrsat",
"e_dhrsun",
"e_dhrmon",
"e_dhrtues",
"e_dhrwed",
"e_dhrthur",
"e_dhrfri",
"e_dhrsat",
"e_ehrsun",
"e_ehrmon",
"e_ehrtues",
"e_ehrwed",
"e_ehrthur",
"e_ehrfri",
"e_ehrsat",
"e_fhrsun",
"e_fhrmon",
"e_fhrtues",
"e_fhrwed",
"e_fhrthur",
"e_fhrfri",
"e_fhrsat",
"e_ghrsun",
"e_ghrmon",
"e_ghrtues",
"e_ghrwed",
"e_ghrthur",
"e_ghrfri",
"e_ghrsat",
"e_hhrsun",
"e_hhrmon",
"e_hhrtues",
"e_hhrwed",
"e_hhrthur",
"e_hhrfri",
"e_hhrsat",
"e_ihrsun",
"e_ihrmon",
"e_ihrtues",
"e_ihrwed",
"e_ihrthur",
"e_ihrfri",
"e_ihrsat",
"e_jhrsun",
"e_jhrmon",
"e_jhrtues",
"e_jhrwed",
"e_jhrthur",
"e_jhrfri",
"e_jhrsat",
"e_khrsun",
"e_khrmon",
"e_khrtues",
"e_khrwed",
"e_khrthur",
"e_khrfri",
"e_khrsat",
"e_lhrsun",
"e_lhrmon",
"e_lhrtues",
"e_lhrwed",
"e_lhrthur",
"e_lhrfri",
"e_lhrsat",
"e_mhrsun",
"e_mhrmon",
"e_mhrtues",
"e_mhrwed",
"e_mhrthur",
"e_mhrfri",
"e_mhrsat",
"e_nhrsun",
"e_nhrmon",
"e_nhrtues",
"e_nhrwed",
"e_nhrthur",
"e_nhrfri",
"e_nhrsat",
"e_ohrsun",
"e_ohrmon",
"e_ohrtues",
"e_ohrwed",
"e_ohrthur",
"e_ohrfri",
"e_ohrsat",
"e_phrsun",
"e_phrmon",
"e_phrtues",
"e_phrwed",
"e_phrthur",
"e_phrfri",
"e_phrsat",
"e_qhrsun",
"e_qhrmon",
"e_qhrtues",
"e_qhrwed",
"e_qhrthur",
"e_qhrfri",
"e_qhrsat",
"e_rhrsun",
"e_rhrmon",
"e_rhrtues",
"e_rhrwed",
"e_rhrthur",
"e_rhrfri",
"e_rhrsat",
"e_thrsun",
"e_thrmon",
"e_thrtues",
"e_thrwed",
"e_thrthur",
"e_thrfri",
"e_thrsat",
"e_earn_a__masked",
"e_earn_b__masked",
"e_earn_c__masked",
"e_earn_d__masked",
"e_earn_e__masked",
"e_earn_f__masked",
"e_earn_g__masked",
"e_earn_h__masked",
"e_earn_i__masked",
"e_earn_j__masked",
"e_earn_l__masked",
"e_earn_m__masked",
"e_earn_n__masked",
"e_earn_p__masked",
"e_earn_q__masked",
"e_earn_r__masked",
"e_tobseas_",
"e_tobhrsown_a_",
"e_tobhrsown_b_",
"e_tobearn__masked",
"e_othhaz_",
"e_othhaz_a_",
"e_exp_",
"e_exp_e_",
"e_timeofday_",
"e_adulttob_",
"e_religion_",
"e_tribe_",
"e_savings",
"e_savingsall",
"e_vslawho",
"e_vslawho_e",
"e_vslawho_g",
"e_vslawho_h",
"e_vslawho_i",
"e_vslawho_j",
"e_vslalen_num",
"e_vslapay",
"e_vslause",
"e_vslause_g",
"e_vslause_i",
"e_vslause_j",
"e_loanwhere",
"e_loanuse",
"e_loanuse_g",
"e_inc1a_9_masked",
"e_inc1a_11_masked",
"e_inca_b",
"e_inc3a",
"b_tribe_")
capture_tables (indirect_PII)
# Recode races or ethnicity to reduce risk of re-identification
haven_table("e_tribe_")
## e_tribe_. e_Tribe
## 1 2 3 4 5 6 7 8 9 10 11 12 13 77 88 <NA>
## 15679 9 190 3170 86 22 230 15 21 58 6 5 17 51 4 18202
# val_labels(mydata$b_tribe_)

# Collapse tribe codes 2, 6, 8, 9, 11 and 12 into code 77; every other listed
# code (1, 3, 4, 5, 7, 10, 13, 77, 88) is mapped to itself. The same mapping is
# applied to the baseline (b_) and endline (e_) variables.
# NOTE(review): in the endline table above code 13 has only 17 observations
# yet is kept, while codes 9 (21) and 6 (22) are collapsed -- confirm this is
# intended. Baseline frequencies are not shown here.
collapse_tribe_codes <- function(tribe_codes) {
  recode(
    tribe_codes,
    `1` = 1L, `2` = 77L, `3` = 3L, `4` = 4L, `5` = 5L,
    `6` = 77L, `7` = 7L, `8` = 77L, `9` = 77L, `10` = 10L,
    `11` = 77L, `12` = 77L, `13` = 13L,
    `77` = 77L, `88` = 88L
  )
}

mydata$b_tribe_ <- collapse_tribe_codes(mydata$b_tribe_)
mydata$e_tribe_ <- collapse_tribe_codes(mydata$e_tribe_)
# Recode those with very specific values.
# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
# Build ONE categorical key per concept by pasting the baseline (b_) and
# endline (e_) answers into a single string, the same way edu_summary is built.
# The former cbind() calls stored a two-column matrix inside each data-frame
# column, so the sdcMicro summary listed "sex_summary.V1"/"sex_summary.V2" and
# "age_summary.V1"/"age_summary.V2" instead of the three declared keys, with
# misaligned statistics and "number of rows of result is not a multiple of
# vector length" warnings.
# paste() renders missing values as the string "NA", so missing answers form
# their own category.
# NOTE(review): the sdcInitial output recorded below this block was produced
# with the matrix-valued keys and must be regenerated after this change.
mydata$sex_summary <- paste(mydata$b_sex_, mydata$e_sex_)
mydata$age_summary <- paste(mydata$b_age_masked, mydata$e_age_masked)

# For each round, take the name of the education column holding the row
# maximum. ties.method = "first" makes the choice deterministic; max.col()
# breaks ties at random by default, which made the key (and therefore the
# risk figures) vary between runs.
educ_vars <- c("b_chprepri", "b_chpri", "b_chuppri", "b_chsec", "b_chhighersec")
mydata$edu_sum1 <- names(mydata[educ_vars])[
  max.col(mydata[educ_vars], ties.method = "first")
]
educ_vars <- c("e_chprepri", "e_chpri", "e_chuppri", "e_chsec", "e_chhighersec")
mydata$edu_sum2 <- names(mydata[educ_vars])[
  max.col(mydata[educ_vars], ties.method = "first")
]
mydata$edu_summary <- paste(mydata$edu_sum1, mydata$edu_sum2)

# Key variables and household identifier handed to sdcMicro
selectedKeyVars <- c("sex_summary", "age_summary", "edu_summary")
selectedHouseholdID <- c("hhid")
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars,
  hhId = selectedHouseholdID
)
## Warning in cbind(reshier, unique(dataX[, 1])): number of rows of result is not a multiple of vector length (arg 1)
# Print the sdcMicro object: key-variable statistics and k-anonymity counts
sdcInitial
## The input dataset consists of 37765 rows and 1335 variables.
## --> Categorical key variables: sex_summary, age_summary, edu_summary
## --> Cluster/Household-Id variable: hhid
## ----------------------------------------------------------------------
## Warning in cbind(stats_rec[, 1], stats_rec[, 2], paste0("(", stats_o[, 2], : number of rows of result is not a multiple of vector
## length (arg 3)
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## sex_summary.V1 4 (4) 6067.000 (6067.000) 3 (3)
## sex_summary.V2 3 (3) 9781.500 (9781.500) 9547 (9547)
## age_summary.V1 2 (6) 8623.000 (1779.200) 8623 (113)
## age_summary.V2 15 (4) 1389.714 (6067.000) 457 (3)
## edu_summary 11 (3) 3433.182 (9781.500) 51 (9547)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 0 (0.000%)
## - 3-anonymity: 0 (0.000%)
## - 5-anonymity: 0 (0.000%)
##
## ----------------------------------------------------------------------
# Remove the temporary key variables created for the risk assessment so they
# are not carried into the exported data set.
dropvars <- c(
  "sex_summary", "age_summary",
  "edu_summary", "edu_sum1", "edu_sum2"
)
mydata <- mydata[is.na(match(names(mydata), dropvars))]
# !!! Identify open-end variables here:
open_ends <- c(
  # baseline (b_) variables
  "b_live_sp_", "b_othhaz_", "b_vsla_desc",
  "b_loanaccess_get", "b_klabwhy", "b_kidlaw_d",
  # endline (e_) variables
  "e_loanaccess_where", "e_klabwhy",
  "e_kidlaw_d", "e_kidlaw_e", "e_type"
)
# Pass the open-end variable list to the report_open() helper
report_open(list_open_ends = open_ends)
# Review "verbatims.csv". Identify variables to be deleted or redacted and
# their row number.
# Replace the b_vsla_desc responses in the rows listed below with the
# placeholder "[Local Language]" (presumably the rows flagged during the
# verbatims review -- the indices are positional, so they assume the row
# order of mydata is unchanged since that review).
mydata$b_vsla_desc[c(
  19578, 19912, 21634, 24756,
  26740, 27560, 29800, 29820,
  29823, 30504, 30975, 31626,
  33112, 34433, 35943, 35949
)] <- "[Local Language]"
# Setup map
# !!!No GPS
# Add "_PU" (Public Use) to the end of the output file name
# Export the data set in Stata (.dta) and SPSS (.sav) formats, both named
# after `filename` with a "_PU" suffix.
haven::write_dta(mydata, sprintf("%s_PU.dta", filename))
haven::write_sav(mydata, sprintf("%s_PU.sav", filename))
# Build the report title from the data file name
title_var <- paste("DOL-ILAB SDC", filename, sep = " - ")