728x90

본 실습 내용은 패스트캠퍼스 강의임을 먼저 알립니다 ~!

 

고객 데이터를 통해서 은행 상품의 마케팅 효과를 알아보는 실습 내용입니다.

변수는 총 20개이며, 마지막 열이 예측 대상인 target입니다.

전처리 및 모델 적용 내용은 실습 코드 안에 주석으로 달았습니다.

# Load the bank-marketing customer data. The first row of the CSV holds the
# column names.
rawdata1 <- read.csv("bank.csv", header = TRUE)
str(rawdata1)

# Inspect the distinct values of each feature.
unique(rawdata1$age)
unique(rawdata1$job) # contains "unknown"; handled as a missing value below
unique(rawdata1$marital) # unknown
unique(rawdata1$education) # unknown
unique(rawdata1$default) # unknown
unique(rawdata1$loan) # unknown
unique(rawdata1$contact) # unknown
unique(rawdata1$month)
unique(rawdata1$day_of_week)
unique(rawdata1$duration)
unique(rawdata1$campaign)
unique(rawdata1$previous)
unique(rawdata1$poutcome)
unique(rawdata1$emp.var.rate)
unique(rawdata1$nr.employed)
unique(rawdata1$target) # label for supervised binary classification

# Missing-value handling: recode every "unknown" cell as NA.
# na.rm = TRUE keeps the count defined even if the file already has NA cells.
sum(rawdata1 == "unknown", na.rm = TRUE)
rawdata1[rawdata1 == "unknown"] <- NA
sum(is.na(rawdata1))

# Drop every row that contains at least one NA.
rawdata1 <- na.omit(rawdata1)

# Store the categorical columns as factors. Since R 4.0 read.csv() leaves text
# columns as character, while the modelling code further down (confusionMatrix)
# needs factors. Calling factor() on an existing factor also drops the
# now-unused "unknown" level, so both old and new R versions end up identical.
is_categorical <- vapply(
  rawdata1,
  function(column) is.character(column) || is.factor(column),
  logical(1)
)
rawdata1[is_categorical] <- lapply(rawdata1[is_categorical], factor)
str(rawdata1)

# Numeric features to inspect, mapped to the fill colour of their histogram.
hist_colors <- c(
  age = "orange",
  duration = "yellow",
  campaign = "green",
  previous = "blue",
  emp.var.rate = "navy",
  cons.price.idx = "purple",
  cons.conf.idx = "salmon",
  euribor3m = "gray",
  nr.employed = "black"
)

# Draw one histogram per entry of `colors` on a single 3x3 page.
#   data   : data frame holding the columns named in `colors`
#   colors : named character vector, names = column names, values = fill colours
# The previous graphics layout is restored on exit so later plots are not
# squeezed into a leftover grid cell.
plot_feature_histograms <- function(data, colors) {
  old_par <- par(mfrow = c(3, 3), mar = c(5.1, 4.1, 4.1, 2.1))
  on.exit(par(old_par), add = TRUE)
  for (feature in names(colors)) {
    hist(
      data[[feature]],
      main = paste(feature, "histogram"),
      xlab = feature,
      col = colors[[feature]]
    )
  }
  invisible(NULL)
}

# Histograms of the raw values: the features sit on very different ranges,
# which shows that standardization is needed.
plot_feature_histograms(rawdata1, hist_colors)

# Standardize the numeric features (mean 0, sd 1).
# scale() returns an n x 1 matrix carrying centre/scale attributes; as.numeric()
# strips it back to a plain vector so the data frame keeps ordinary columns.
rawdata1[names(hist_colors)] <- lapply(
  rawdata1[names(hist_colors)],
  function(column) as.numeric(scale(column))
)

# Histograms after standardization.
plot_feature_histograms(rawdata1, hist_colors)

# Why standardize up front (original author's rationale): the data set mixes
# numeric and categorical columns, so the scaling is applied here to the numeric
# columns only instead of relying on caret's preProcess.

# Categorical features: one bar plot of relative frequencies per column,
# laid out on a 3x3 page.
par(mfrow = c(3, 3), mar = c(5.1, 4.1, 4.1, 2.1))
local({
  # Column name -> plot title, in drawing order.
  panel_titles <- c(
    job = "직업 비율",
    marital = "결혼여부",
    education = "교육수준",
    default = "파산여부",
    housing = "주택대출여부",
    loan = "개인대출여부",
    contact = "연락방법",
    month = "마지막 연락 달",
    day_of_week = "마지막 연락 요일"
  )
  for (column_name in names(panel_titles)) {
    level_share <- prop.table(table(rawdata1[[column_name]]))
    barplot(level_share, main = panel_titles[[column_name]])
  }
})

# Principal component analysis of the numeric features (author's rule of thumb:
# keep components until roughly 90% or more of the variance is explained).
num_feature <- c(
  "age", "duration", "campaign", "previous", "emp.var.rate",
  "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"
)

tar <- rawdata1[, "target"]
num_data <- rawdata1[, num_feature]
pca_num <- prcomp(num_data)

# The plotting sections above leave a 3x3 layout active; switch back to one
# plot per page so the scree plot is not drawn into a single small grid cell.
par(mfrow = c(1, 1))
# The higher a component sits on the vertical axis, the more variance it
# explains.
plot(pca_num, type = "l", main = "Principal")
summary(pca_num)

# rotation: the principal-component vectors (loadings).
# %*%: matrix product of the data with those vectors, giving the scores.
# NOTE(review): prcomp() centres internally, so this product equals pca_num$x
# only because the columns were standardized earlier in the script.
pca_matrix <- pca_num$rotation
pca_matrix
pca_data <- as.matrix(num_data) %*% pca_matrix

# Combine the first three components with the label. data.frame() is used
# instead of cbind(): cbind() builds one matrix, which would turn the scores
# into character strings when `tar` is character, or replace the class labels
# with integer codes when it is a factor.
reduced_data <- data.frame(pca_data[, 1:3], tar = as.factor(tar))
reduced_data
str(reduced_data)

# Visualization in the reduced space: first two components, coloured and
# shaped by class.
library(ggplot2)
ggplot(data = reduced_data, aes(x = PC1, y = PC2)) +
  geom_point(aes(color = tar, shape = tar)) +
  xlab("PC1") +
  ylab("PC2") +
  ggtitle("PCA DATA")


# install.packages("scatterplot3d")
library(scatterplot3d)
# Class codes 1 and 2 map to plotting symbols 20 and 21 (pch = symbol).
class_code <- as.integer(reduced_data$tar)
shapes <- c(20, 21)[class_code]
# Colours are passed as integer class codes rather than as a factor, so they
# do not depend on how the plotting function coerces factor labels.
scatterplot3d(
  reduced_data[, 1:3],
  color = class_code,
  pch = shapes,
  angle = 45
)

# Train/test split: a seeded random 70% of the rows goes to `train`,
# the remaining rows go to `test`.
set.seed(2021)
newdata <- rawdata1
datatotal <- sort(sample.int(nrow(newdata), size = nrow(newdata) * 0.7))
test <- newdata[-datatotal, ]
train <- newdata[datatotal, ]

# ===============================
# 1. Logistic regression
library(caret)
# Repeated cross-validation, 5 repeats; the fold count is left at caret's
# default.
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
logit_fit <- train(
  target ~ .,
  data = train,
  method = "glm", # generalized linear model, i.e. logistic regression here
  trControl = ctrl,
  metric = "Accuracy"
)
logit_fit
# Predict on the held-out rows.
logit_pred <- predict(logit_fit, newdata = test)
# confusionMatrix() requires both arguments to be factors with the same levels.
# test$target is character when the CSV was read with R >= 4.0 defaults, so it
# is coerced to the prediction's levels (a no-op if it is already such a factor).
confusionMatrix(
  logit_pred,
  factor(test$target, levels = levels(logit_pred))
)

# ===============================
# 2. Boosted logistic regression
library(caret)
# Repeated cross-validation, 5 repeats; the fold count is left at caret's
# default.
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
logit_boosted_fit <- train(
  target ~ .,
  data = train,
  method = "LogitBoost",
  trControl = ctrl,
  metric = "Accuracy"
)
logit_boosted_fit
# Predict on the held-out rows.
logit_boosted_pred <- predict(logit_boosted_fit, newdata = test)
# confusionMatrix() requires both arguments to be factors with the same levels.
# test$target is character when the CSV was read with R >= 4.0 defaults, so it
# is coerced to the prediction's levels (a no-op if it is already such a factor).
confusionMatrix(
  logit_boosted_pred,
  factor(test$target, levels = levels(logit_boosted_pred))
)
728x90

+ Recent posts