> sapply(ds, is.numeric)
date location min_temp max_temp rainfall
FALSE FALSE TRUE TRUE TRUE
evaporation sunshine wind_gust_dir wind_gust_speed wind_dir_9am
TRUE TRUE FALSE TRUE FALSE
wind_dir_3pm wind_speed_9am wind_speed_3pm humidity_9am humidity_3pm
FALSE TRUE TRUE TRUE TRUE
pressure_9am pressure_3pm cloud_9am cloud_3pm temp_9am
TRUE TRUE TRUE TRUE TRUE
temp_3pm rain_today risk_mm rain_tomorrow
TRUE FALSE TRUE FALSE
> which(sapply(ds, is.numeric))
min_temp max_temp rainfall evaporation sunshine
3 4 5 6 7
wind_gust_speed wind_speed_9am wind_speed_3pm humidity_9am humidity_3pm
9 12 13 14 15
pressure_9am pressure_3pm cloud_9am cloud_3pm temp_9am
16 17 18 19 20
temp_3pm risk_mm
21 23
> ds[which(sapply(ds, is.numeric))]
347 4 12.6 22.3 0.0
348 7 16.3 23.2 13.2
349 3 14.5 19.4 0.6
350 4 11.6 18.4 0.0
351 1 9.6 19.2 0.0
352 1 11.6 21.9 0.0
353 7 12.7 23.7 0.0
354 6 16.8 27.4 0.2
355 1 16.4 26.3 0.0
356 5 11.4 18.5 0.8
357 5 8.3 14.3 0.0
358 2 9.1 16.3 0.0
359 8 9.4 19.1 0.0
360 3 12.0 24.8 0.0
361 8 16.3 25.9 0.0
362 3 20.4 30.0 0.0
363 1 17.2 28.2 0.0
364 2 14.5 18.3 0.0
365 7 15.8 25.9 0.0
366 1 23.8 28.6 0.0
> cor(ds[which(sapply(ds, is.numeric))], use="complete.obs")
min_temp max_temp rainfall evaporation sunshine
min_temp 1.00000000 0.74957059 0.19262842 0.64591028 0.02922045
max_temp 0.74957059 1.00000000 -0.08426446 0.68687502 0.45347679
rainfall 0.19262842 -0.08426446 1.00000000 -0.01680014 -0.15458080
evaporation 0.64591028 0.68687502 -0.01680014 1.00000000 0.31973353
sunshine 0.02922045 0.45347679 -0.15458080 0.31973353 1.00000000
wind_gust_speed 0.19815186 0.08856728 0.09049418 0.27327633 0.08714998
wind_speed_9am 0.12973234 -0.21632154 0.22529814 0.06940095 -0.06485128
wind_speed_3pm -0.08636593 -0.18523246 0.04561951 0.04082213 0.06559565
humidity_9am -0.20501300 -0.35917528 0.14996922 -0.52024479 -0.49833660
humidity_3pm -0.03888506 -0.53507252 0.28986682 -0.39012837 -0.76094239
pressure_9am -0.48993043 -0.27449041 -0.33061469 -0.37115877 0.01634769
pressure_3pm -0.48605859 -0.36546182 -0.24557034 -0.38241479 -0.02692562
cloud_9am 0.20752589 -0.18157383 0.17100694 -0.11117877 -0.68845933
cloud_3pm 0.11055269 -0.14913071 0.13391997 -0.11082361 -0.66297321
temp_9am 0.91564396 0.86972800 0.06835645 0.70348328 0.21555745
temp_3pm 0.72030877 0.98910491 -0.09763260 0.66850386 0.47267524
risk_mm 0.21670559 0.02559766 0.09307284 0.07579224 -0.38226246
> mc <- cor(ds[which(sapply(ds, is.numeric))], use="complete.obs")
> mc[upper.tri(mc, diag=TRUE)] <- NA
> library(dplyr)
> library(tidyr)
> mc <- mc %>%
abs() %>%
data.frame() %>%
mutate(var1=row.names(mc)) %>%
gather(var2, cor, -var1) %>%
na.omit()
> mc <- mc[order(-abs(mc$cor)),]
> mc
var1 var2 cor
33 temp_3pm max_temp 0.989104911
182 pressure_3pm pressure_9am 0.966604486
15 temp_9am min_temp 0.915643957
32 temp_9am max_temp 0.869727997
254 temp_3pm temp_9am 0.843836925
78 humidity_3pm sunshine 0.760942386
2 max_temp min_temp 0.749570594
16 temp_3pm min_temp 0.720308771
상관관계가 지나치게 높은 변수 쌍을 찾아, 각 쌍에서 한쪽 변수를 제외하고 나머지를 분석에 사용한다.
이는 사용자 판단의 문제라 수동으로 처리할 수밖에 없는데, 보통 상관도 0.95를 기준으로 한다. (아래에서 temp_9am은 min_temp와의 상관도가 약 0.92로 기준에는 미치지 못하지만, 충분히 높다고 판단하여 함께 제외한다.)
ignore <- union(ignore, c("temp_3pm", "pressure_9am", "temp_9am"))
- apply 함수에 대해서는 아래 링크를 참조한다.
http://blog.naver.com/beingawesome/220198554418
'프로그래밍 Programming' 카테고리의 다른 글
Data Preparation (18) - Prepare (Numeric and Categoric Variables) (0) | 2014.12.06 |
---|---|
Data Preparation (17) - Prepare (Variables) (0) | 2014.12.06 |
Data Preparation (16) - Clean (Ensure Target is Categoric) (0) | 2014.12.06 |
Data Preparation (15) - Clean (Normalise Factors) (0) | 2014.12.06 |
Data Preparation (14) - Clean (Omitting Observations) (0) | 2014.12.06 |