-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclean_2022_WSHFC_data.R
More file actions
262 lines (234 loc) · 9.45 KB
/
clean_2022_WSHFC_data.R
File metadata and controls
262 lines (234 loc) · 9.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
#################################################################################
# Title: Cleaning 2022 WSHFC data
# Author: Eric Clute
# Date created: 2022-11-30
# Last Updated: 2024-08-02
#################################################################################
## load packages-----------------------------------------------------------------
library(tidyverse)
library(readxl)
library(janitor)
library(data.table)
## 1) Set variables ---------------------------------------------------------------------
setwd("C:/Users/eclute/GitHub/irhd")
WSHFC_path <- "J:/Projects/IncomeRestrictedHsgDB/2022 vintage/Data/WSHFC/"
WSHFC_raw <- read_xlsx(paste0(WSHFC_path, "PSRC report for 2022.xlsx"))
vintage_year_cleaning_script = "2022"
address_func <- "./address_match.R"
remotes::install_github("slu-openGIS/postmastr")
source(address_func)
## 2) function --------------------------------------------------------------------
#create function to select and arrange columns needed for joining
select_and_arrange_columns_function <- function(df){
df <- df %>%
select(any_of(c("data_source",
"project_id",
"project_name",
"property_id",
"property_name",
"property_owner",
"manager",
"in_service_date",
"expiration_date",
"reported_address",
"city",
"zip",
"county",
"total_units",
"total_restricted_units",
"ami_20",
"ami_25",
"ami_30",
"ami_35",
"ami_40",
"ami_45",
"ami_50",
"ami_60",
"ami_65",
"ami_70",
"ami_75",
"ami_80",
"ami_85",
"ami_90",
"ami_100",
"ami_120",
"market_rate",
"manager_unit",
"bedroom_0",
"bedroom_1",
"bedroom_2",
"bedroom_3",
"bedroom_4",
"bedroom_5",
"bedroom_unknown",
"bed_count",
"site_type",
"home",
"HOMEcity",
"HOMEcounty",
"HOMEstate",
"confidentiality",
"policy",
"senior",
"disabled",
"farmworker",
"homeless",
"sro",
"large_household",
"transitional",
"veterans",
"funding_sources",
"tenure")))
}
## 3) clean WSHFC data --------------------------------------------------------------------
# ------- DATA FILTER #1 ------- filter by county, create/modify fields
WSHFC_cleaned <- WSHFC_raw %>%
filter(County == "Snohomish" | County == "Pierce" | County == "Kitsap")
#create grouped funder column
WSHFC_cleaned %<>%
group_by(`Site Name`, `Address`) %>%
mutate(Funder = paste(sort(unique(Funder)), collapse = ","))
# ------- DATA FILTER #2 ------- select entry with the largest total restricted unit count
WSHFC_cleaned <- WSHFC_cleaned %>%
group_by(`Site Name`, Address) %>%
slice_max(`Income & Rent Restricted Units`,
n = 1,
with_ties = TRUE) %>%
distinct()
#check for duplicates
WSHFC_cleaned %>%
unique() %>%
group_by(`Site Name`, Address) %>%
mutate(n = n()) %>%
filter(n > 1) %>%
arrange(`Project Name`, `Site Name`, Address) # %>%
# view()
# ------- DATA FILTER #3 ------- select only entry with latest expiration date
WSHFC_cleaned <- WSHFC_cleaned %>%
group_by(`Site Name`, Address) %>%
slice_max(`Project Expiration Date`,
n = 1,
with_ties = TRUE) %>%
distinct()
#check for duplicates
WSHFC_cleaned %>%
unique() %>%
group_by(`Site Name`, Address) %>%
mutate(n = n()) %>%
filter(n > 1) %>%
arrange(`Project Name`, `Site Name`, Address)# %>%
# view()
# ------- DATA FILTER #4 ------- select entry with earliest in service date
WSHFC_cleaned <- WSHFC_cleaned %>%
group_by(`Site Name`, Address) %>%
slice_min(`First Credit Year or C of O's`,
n = 1,
with_ties = TRUE) %>%
distinct()
#check for duplicates
WSHFC_cleaned %>%
distinct() %>%
group_by(`Site Name`, Address) %>%
mutate(n = n()) %>%
filter(n > 1) %>%
arrange(`Project Name`, `Site Name`, Address)# %>%
# view()
# ------- DATA FILTER #4 ------- for entries where there are multiple properties with the same total restricted unit count but different other data, select record that seems correct
WSHFC_cleaned <- WSHFC_cleaned %>%
distinct() %>%
filter(!(`Project Name` == "Annobee Apartments, The" & `Site Name` == "Annobee Apartments, The" & `80%` == 43)) %>% #remove this record, keep record with pop served & deeper affordability
filter(!(`Project Name` == "Catalina Apartments" & `Site Name` == "Catalina Apartments" & `40%` == 32)) %>% #remove this record, keep record with pop served & deeper affordability
filter(!(`Project Name` == "Maternity Shelter (Youth Emergency Shelter (YES) North)" & `Site Name` == "Youth Emergency Shelter (YES) North" & `50%` == 8)) #remove this record, keep record with deeper affordability
#check to see if any duplicates remaining - should be 0
WSHFC_cleaned %>%
distinct() %>%
group_by(`Site Name`, Address) %>%
mutate(n = n()) %>%
filter(n > 1) %>%
arrange(`Project Name`, `Site Name`, Address)# %>%
# view()
# ------- DATA FILTER #5 ------- Small edits/checks
#Filter by InServiceDate - select only records in this vintage year or earlier
WSHFC_cleaned <- WSHFC_cleaned %>%
filter(`First Credit Year or C of O's` <= vintage_year_cleaning_script)
#Consolidate SRO and STUDIO into one column
WSHFC_cleaned$STUDIO = WSHFC_cleaned$SRO + WSHFC_cleaned$STUDIO
## 4) clean up field names --------------------------------------------------------------------
#rename columns and add empty columns for data we dont have
WSHFC_cleaned <- WSHFC_cleaned %>%
mutate(DataSource = as.character(NA),
ami_25 = as.numeric("0"),
ami_75 = as.numeric("0"),
ami_85 = as.numeric("0"),
ami_90 = as.numeric("0"),
ami_100 = as.numeric("0"),
ami_120 = as.numeric("0"),
market_rate = as.numeric("0"),
manager_unit = as.numeric("0"),
confidentiality = as.character(NA),
policy = as.character(NA),
tenure = as.character(NA)) %>%
rename(project_id = `ProjectKey`,
project_name = `Project Name`,
property_id = `SiteKey`,
property_name = `Site Name`,
property_owner = `Contractor/Owner Org`,
manager = `Property Management Org`,
city = `City`,
total_units = `Total Project Units`,
total_restricted_units = `Income & Rent Restricted Units`,
in_service_date = `First Credit Year or C of O's`,
ami_20 = `20%`,
ami_30 = `30%`,
ami_35 = `35%`,
ami_40 = `40%`,
ami_45 = `45%`,
ami_50 = `50%`,
ami_60 = `60%`,
ami_65 = `65%`,
ami_70 = `70%`,
ami_80 = `80%`,
bedroom_0 = `STUDIO`,
bedroom_1 = `1 BR`,
bedroom_2 = `2 BR`,
bedroom_3 = `3 BR`,
bedroom_4 = `4 BR`,
bedroom_5 = `5 BR`,
bedroom_unknown = `Unknown`,
bed_count = `GROUP HOME/BED`,
home = `Number of HOME Units`,
HOMEcity = `HOME City`,
HOMEcounty = `HOME County`,
HOMEstate = `HOME State`,
funding_sources = `Funder`,
expiration_date = `Project Expiration Date`,
large_household = `Large Household (4+ pp)`,
site_type = `Site Type`,
senior = `Elderly`,
disabled = `Persons with Disabilities`,
reported_address = `Address`,
county = `County`,
farmworker = `Farmworker`,
sro = `SRO`,
homeless = `Homeless`,
transitional = `Transitional`,
data_source = `DataSource`,
veterans = `Veterans`,
zip = `Zip`)
#select only necessary columns and arrange columns
WSHFC_cleaned <- select_and_arrange_columns_function(WSHFC_cleaned)
#set DataSource field
WSHFC_cleaned$data_source = "WSHFC"
WSHFC_cleaned$reported_address[WSHFC_cleaned$reported_address == '1724 E. 44th'] <- '1724 E 44th Street'
WSHFC_cleaned$reported_address[WSHFC_cleaned$reported_address == '9225 Bayshore Drive NW'] <- '9225 Bay Shore Dr NW'
WSHFC_cleaned$reported_address[WSHFC_cleaned$reported_address == '9239 Bayshore Dr NW'] <- '9239 Bay Shore Dr NW'
#clean address field for matching
WSHFC_cleaned$full_address <- str_c(WSHFC_cleaned$reported_address,', ',WSHFC_cleaned$city,', WA ',WSHFC_cleaned$zip)
WSHFC_cleaned <- as.data.frame(WSHFC_cleaned)
WSHFC_cleaned <- add_cleaned_addresses(WSHFC_cleaned) %>% setDT()
#set proper data types for matching
WSHFC_cleaned$data_source <- as.character(WSHFC_cleaned$data_source)
WSHFC_cleaned$project_id <- as.character(WSHFC_cleaned$project_id)
WSHFC_cleaned$property_id <- as.character(WSHFC_cleaned$property_id)
WSHFC_cleaned$in_service_date <- as.character(WSHFC_cleaned$in_service_date)
rm(WSHFC_raw, WSHFC_path, select_and_arrange_columns_function,vintage_year_cleaning_script)