KBO 타자 OPS 예측 실습(데이콘 경진대회 1등 솔루션)

실습한 내용은 데이콘 경진대회 1등 솔루션 책 내용입니다.
제 1장인 KBO 타자 OPS 예측 실습 내용을 포스팅 하겠습니다.
전체적인 진행 설명은 파일안에 기록했습니다

KBO 타자 OPS 예측

#import 
from matplotlib import font_manager, rc
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import platform

# Choose a Hangul-capable font per operating system so Korean plot labels render.
# The Windows branch keeps the original Malgun Gothic lookup; loading that file
# path unconditionally fails on machines where it does not exist.
system_name = platform.system()
if system_name == 'Windows':
    font_name = font_manager.FontProperties(fname='c:/Windows/Fonts/malgun.ttf').get_name()
elif system_name == 'Darwin':
    font_name = 'AppleGothic'
else:
    # NOTE(review): assumes a Nanum font is installed on this machine — confirm
    font_name = 'NanumGothic'
rc('font', family=font_name)

# Render the minus sign with an ASCII hyphen so it shows up with the font above
matplotlib.rcParams['axes.unicode_minus']=False

1.EDA(탐색적 데이터 분석)

프리시즌 살펴보기

# Load the pre-season batter table
preseason_df=pd.read_csv('D:/dacon/KBO 타자 OPS 예측/Pre_Season_Batter.csv')
# Load the regular-season batter table
regular_season_df=pd.read_csv('D:/dacon/KBO 타자 OPS 예측/Regular_Season_Batter.csv')
# Print the (rows, columns) shape of the pre-season table
print(preseason_df.shape)
# Show the first rows (display is the notebook helper; it is not imported in this file)
display(preseason_df.head())

# Summary statistics of the pre-season table
display(preseason_df.describe())

# Histograms of the pre-season columns
preseason_df.hist(figsize=(10,9))
plt.tight_layout() # adjust the spacing between subplots
plt.show()

# Records per year in each table, counted on the batter_name column
regular_count = (
    regular_season_df.groupby('year')['batter_name']
    .count()
    .rename('regular')
)
preseason_count = (
    preseason_df.groupby('year')['batter_name']
    .count()
    .rename('preseason')
)

# Side-by-side table (one column per year, 2002 onward) with the
# pre-season / regular-season ratio rounded to two decimals
pd.concat(
    [
        regular_count,
        preseason_count,
        np.round(preseason_count / regular_count, 2).rename('ratio'),
    ],
    axis=1,
).T.loc[:, 2002:]

# Build a "<batter_name><year>" key in both tables so rows can be matched across them
regular_season_df['new_idx']=regular_season_df['batter_name']+regular_season_df['year'].apply(str)
preseason_df['new_idx']=preseason_df['batter_name']+preseason_df['year'].apply(str)

# Keys that occur in both tables (set intersection)
intersection_idx=list(set(regular_season_df['new_idx']).intersection(preseason_df['new_idx']))

# Keep only the rows whose key is in the intersection, sorted by key so the two
# frames line up row by row. isin() hashes the keys once instead of scanning the
# list for every row.
regular_season_new=regular_season_df.loc[regular_season_df['new_idx'].isin(intersection_idx)]
regular_season_new=regular_season_new.sort_values(by='new_idx').reset_index(drop=True)

preseason_new=preseason_df.loc[preseason_df['new_idx'].isin(intersection_idx)]
preseason_new=preseason_new.sort_values(by='new_idx').reset_index(drop=True)

# Sanity check: both shapes, then the number of positions where the keys agree
# (the original printed the pre-season shape twice, so the check compared nothing)
print(regular_season_new.shape, preseason_new.shape)
sum(preseason_new['new_idx']==regular_season_new['new_idx'])

(1358, 30) (1358, 30)
1358

intersection을 활용한 교집합 + set을 이용한 중복 제거(두 시즌 모두 참여한 선수 추출)
apply와 lambda를 이용한 적용방법

# Correlation between regular-season OPS and pre-season OPS of the matched rows
correlation=regular_season_new["OPS"].corr(preseason_new["OPS"])
# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally
sns.scatterplot(x=regular_season_new["OPS"], y=preseason_new["OPS"])
plt.title('correlation(상관계수):'+str(np.round(correlation,2)), fontsize=20)
plt.xlabel('정규시즌 OPS:', fontsize=12)
plt.ylabel('프리시즌 OPS:', fontsize=12)
plt.show()

정규시즌 데이터 분석

# Summary statistics of the regular-season table
regular_season_df.describe()

# Histograms of the regular-season columns
regular_season_df.hist(figsize=(10,9))
plt.tight_layout()
plt.show()

plt.figure(figsize=(15,6))
plt.subplot(1,2,1) # first panel of a 1-row, 2-column grid
g= sns.boxplot(x='year', y='OPS', data=regular_season_df, showfliers=False)
g.set_title('연도별 OPS 상자그림', size=20)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
plt.subplot(1,2,2)
plt.plot(regular_season_df.groupby('year')['OPS'].median()) # yearly median; the author chose it because OPS looks skewed
plt.title('연도별 OPS 중앙값:', size=20)
plt.show()

# The swings before 2000 are large, so count the records per year
pd.crosstab(regular_season_df['year'], 'count').T
# Author's reading of that table: few records before 2000, hence the high variability

# OPS by team and year

# Median OPS of each team per year (teams as rows, years as columns)
med_OPS=regular_season_df.pivot_table(index=['team'], columns='year', values="OPS", aggfunc="median")

# Boolean mask of teams with no missing median from 2005 onward
team_idx=med_OPS.loc[:,2005:].isna().sum(axis=1)<=0  

plt.plot(med_OPS.loc[team_idx,2005:].T)
plt.legend(med_OPS.loc[team_idx,2005:].T.columns, loc='center left', bbox_to_anchor=(1,0.5)) # put the legend outside the axes
plt.title('팀별 성적')
plt.show()

team_idx=med_OPS.loc[:,2005:].isna().sum(axis=1)<=0 결측치가 있는걸 제거 하는 코드

키와 몸무게가 성적과 관련이 있는지 확인

import re 
# 'height/weight' looks like '177cm/93kg': the first run of digits after the '/' is
# the weight, the first run before it is the height. Missing values pass through.
# Raw strings are used for the pattern; '\d' in a normal string is an invalid escape.
regular_season_df['weight']=regular_season_df['height/weight'].apply(lambda x: int(re.findall(r'\d+', x.split('/')[1])[0]) if pd.notnull(x) else x )

regular_season_df['height']=regular_season_df['height/weight'].apply(lambda x: int(re.findall(r'\d+', x.split('/')[0])[0]) if pd.notnull(x) else x )

# Spot-check the first row: raw text, parsed weight, parsed height
print(regular_season_df['height/weight'][0], regular_season_df['weight'][0],regular_season_df['height'][0])

177cm/93kg 93.0 177.0

# Weight divided by height
regular_season_df['weight_per_height']=regular_season_df['weight']/ regular_season_df['height']
plt.figure(figsize=(15,5)) # figure size
plt.subplot(1,2,1)

# Left panel: weight/height ratio against on-base percentage (OBP)
correlation= regular_season_df['weight_per_height'].corr(regular_season_df['OBP'])
# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally
sns.scatterplot(x=regular_season_df['weight_per_height'], y=regular_season_df['OBP'])
plt.title(" '몸무게/키'와 출루율 correlation(상관관계):"+str(np.round(correlation,2)),fontsize=15)
plt.ylabel('정규시즌 OBP', fontsize=12)
plt.xlabel('몸무게/키', fontsize=12)

# Right panel: weight/height ratio against slugging percentage (SLG)
plt.subplot(1,2,2)
correlation=regular_season_df['weight_per_height'].corr(regular_season_df['SLG'])
sns.scatterplot(x=regular_season_df['weight_per_height'], y=regular_season_df['SLG'])
# Title fixed: the original contained "dhk" (the Korean particle "와" typed on a
# Latin keyboard) and lacked the colon that the left panel's title has
plt.title(" '몸무게/키'와 장타율 correlation(상관관계):"+str(np.round(correlation,2)), fontsize=15)
plt.ylabel('정규시즌 SLG', fontsize=12)
plt.xlabel('몸무게/키', fontsize=12)
plt.show()

# Frequency of each distinct 'position' string
regular_season_df['position'].value_counts()

내야수(우투우타)    643
외야수(우투우타)    230
외야수(좌투좌타)    201
포수(우투우타)     189
외야수(우투좌타)    184
내야수(우투좌타)    141
내야수(좌투좌타)     36
포수(우투좌타)      14
내야수(우투양타)      7
외야수(우투양타)      7
Name: position, dtype: int64

# Break the combined 'position' text, e.g. '내야수(우투우타)', into two columns.
position_text = regular_season_df['position']

# Fielding position: the text before the first '('; missing values stay missing
regular_season_df['pos'] = position_text.str.split('(').str[0]

# Batting side: the two characters just before the closing ')'
regular_season_df['hit_way'] = position_text.str[-3:-1]

print(
    regular_season_df['position'][0],
    regular_season_df['pos'][0],
    regular_season_df['hit_way'][0],
)

내야수(우투우타) 내야수 우타

선수들의 포지션을 통해서 왼손잡이,오른손잡이를 알아내고 성적과 상관관계를 알아내는 추론과정

plt.figure(figsize=(15,5)) 
plt.subplot(1,2,1)
ax= sns.boxplot(x='pos', y='OPS', data=regular_season_df,
                showfliers=False) # do not draw the outlier points

# Median OPS per position as a dict keyed by position
median= regular_season_df.groupby('pos')['OPS'].median().to_dict() # e.g. {'내야수': 0.706, '외야수': 0.7190000000000001, '포수': 0.639}

# Number of rows per position -> the text written onto the plot
nobs=regular_season_df['pos'].value_counts().to_dict()  # e.g. {'내야수': 827, '외야수': 622, '포수': 203}

# Rewrite every count as the string 'n:<count>'
for key in nobs: nobs[key] = "n:"+str(nobs[key])  # assigning through the loop key rewrites each entry exactly once
    
# Text of the x tick labels, in axis order
xticks_labels = [item.get_text() for item in ax.get_xticklabels()]  # e.g. ['내야수', '외야수', '포수']


# Write each count slightly above the median line of its box
for label in ax.get_xticklabels(): # walks the x-axis categories in order
#     print(xticks_labels.index(label.get_text()))  # 0,1,2 in order
#     print(label.get_text())                       # 내야수 외야수 포수
    
    ax.text(xticks_labels.index(label.get_text()), # x position: index of the category on the axis
           median[label.get_text()]+0.03,          # y position: 0.03 above the median
            nobs[label.get_text()],                # text to draw
           horizontalalignment='center', size='large', color='w', weight='semibold')
    print(label.get_text())
ax.set_title('포지션별 OPS')

plt.subplot(1,2,2)
ax= sns.boxplot(x='hit_way', y='OPS', data=regular_season_df, showfliers=False)

# Median OPS per batting side
median=regular_season_df.groupby('hit_way')['OPS'].median().to_dict() 

# Number of rows per batting side
nobs = regular_season_df['hit_way'].value_counts().to_dict()

# Rewrite every count as the string 'n:<count>'
for key in nobs: nobs[key] = 'n:'+str(nobs[key])

# Text of the x tick labels, in axis order
xticks_labels=[item.get_text() for item in ax.get_xticklabels()] # the hit_way categories collected into a list

# The label's index in xticks_labels is its x position; its text selects the median and count
for label in ax.get_xticklabels():
    ax.text(
    xticks_labels.index(label.get_text()),
    median[label.get_text()]+0.03,
    nobs[label.get_text()], horizontalalignment='center', size= 'large',
    color='w', weight='semibold')
ax.set_title('타석방향별 OPS')
plt.show()

to_dict과 그래프 적용하는 과정
xticks_labels.index(label.get_text()): x축 index 추출 과정

커리어 변수를 이용하여 외/내국인 차이를 탐색

# First space-separated token of 'career' after removing every '-'
# NOTE(review): the lambda calls str methods directly, so a missing 'career' value would raise here — confirm the column has none
foreign_country = regular_season_df['career'].apply(lambda x:x.replace('-','').split(' ')[0])

# Tokens containing '초' are turned into NaN; set() keeps one copy of each remaining token
# NOTE(review): presumably '초' marks a Korean elementary school, so what remains are foreign origins — confirm
foreign_country_list= list(set(foreign_country.apply(lambda x:np.nan if '초' in x else x))) # NaN where '초' appears, otherwise the token itself
                                                                                              # set() de-duplicates the tokens
# Drop the NaN entry from the list
foreign_country_list = [x for x in foreign_country_list if str(x) != 'nan']
foreign_country_list

['쿠바', '도미니카삼성', '캐나다', '도미니카', '네덜란드', '미국']

# Start from the first career token, then label each row:
# missing stays missing, a token found in foreign_country_list -> 'foreign', anything else -> 'korean'
regular_season_df['country']=foreign_country
regular_season_df['country']=regular_season_df['country'].apply(lambda x: x if pd.isnull(x) else ('foreign' if x in foreign_country_list else 'korean'))
regular_season_df[['country']].head()

plt.figure(figsize=(15,5))
ax= sns.boxplot(x='country', y='OPS', data=regular_season_df, showfliers=False)

# Median OPS per 'country' label, as a dict
median= regular_season_df.groupby(['country'])['OPS'].median().to_dict()

# Number of rows per 'country' label
nobs = regular_season_df['country'].value_counts().to_dict()

# Rewrite every count as the string 'n:<count>'
for key in nobs : nobs[key] = 'n:'+str(nobs[key]) # keys are the labels, e.g. 'foreign' and 'korean'

# Text of the x tick labels, then write each count 0.03 above the median of its box
xticks_labels=[item.get_text() for item in ax.get_xticklabels()]
for label in ax.get_xticklabels():
    ax.text(
    xticks_labels.index(label.get_text()),
        median[label.get_text()]+0.03,
        nobs[label.get_text()],
        horizontalalignment='center', size='large', color='w', weight='semibold')
ax.set_title('국적별 OPS')
plt.show()

# Parse 'starting_salary':
#   - missing values stay missing (they are NOT turned into 0),
#   - text containing '만원' becomes its first run of digits as an int,
#   - any other text becomes NaN.
# The pattern is a raw string; '\d' in a normal string is an invalid escape.
regular_season_df['starting_salary']=regular_season_df['starting_salary'].apply(lambda x:x if pd.isnull(x) else(int(re.findall(r'\d+',x)[0]) if '만원' in x else np.nan))

plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
# Distribution of the non-missing salaries
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — consider histplot once the minimum seaborn version is known
b=sns.distplot(regular_season_df['starting_salary'].\
              loc[regular_season_df['starting_salary'].notnull()], hist=True)
b.set_xlabel('starting salary', fontsize=12)  # label typo fixed ('staring')
b.set_title('초봉의 분포', fontsize=20)

plt.subplot(1,2,2)
# Starting salary against regular-season OPS
correlation=regular_season_df['starting_salary'].corr(regular_season_df['OPS'])
b=sns.scatterplot(x=regular_season_df['starting_salary'], y=regular_season_df['OPS'])
b.axes.set_title('correlation(상관계수):'+str(np.round(correlation,2)), fontsize=20)
b.set_ylabel('정규시즌 OPS:',fontsize=12)
b.set_xlabel("초봉",fontsize=12)
plt.show()

일별 데이터 분석

# Load the day-by-day batter records and show their shape and first rows
day_by_day_df=pd.read_csv('D:/dacon/KBO 타자 OPS 예측/Regular_Season_Batter_Day_by_Day_b4.csv')
display(day_by_day_df.shape, day_by_day_df.head())

(112273, 20)

# Month = the text before the '.' in 'date' (the value is converted to str first
# because split is a string method)
day_by_day_df['month']=day_by_day_df['date'].apply(lambda x:str(x).split('.')[0])

# Mean of every numeric column per (year, month).
# numeric_only=True is required on pandas >= 2.0, where averaging the string
# columns raises instead of being silently skipped.
agg_df= day_by_day_df.groupby(['year','month']).mean(numeric_only=True).reset_index()
agg_df

# Re-shape as a pivot table: months as rows, years as columns, mean 'avg2' as values
agg_df=day_by_day_df.pivot_table(index='month', columns='year', values='avg2')
agg_df

date변수를 월을 추출해서 월별 평균 타율을 추출과정

# Skip the first two rows and the first ten columns of the pivot table to simplify the plot.
# NOTE(review): the author intends to drop March and October (which contain missing values) and keep 2011-2018;
# this relies on 'month' being a string index that sorts as '10', '3', '4', ... — confirm
display(agg_df.iloc[2:,10:])
plt.plot(agg_df.iloc[2:,10:]) # presumably years 2011-2018 — confirm against the columns
plt.legend(agg_df.iloc[2:,10:].columns, loc='center left', bbox_to_anchor=(1,0.5)) # put the legend outside the axes
plt.title('연도별 평균 타율')
plt.show()

데이터 전처리

결측치 처리 및 데이터 오류 처리

# dtypes treated as numeric, and the regular-season columns that have one of them
numberics =['int16','int32','int64','float16','float32','float64']
num_cols=regular_season_df.select_dtypes(include=numberics).columns # .columns keeps only the column names

수치형 타입을 이용해서 해당되는 데이터를 찾아내는 과정
select_dtypes

# Rows that have at least one missing value among the numeric columns, showing only those columns
regular_season_df.loc[regular_season_df[num_cols].isna().sum(axis=1)>0 , num_cols].head() 
# The row mask is "count of NaN across num_cols > 0", so a row is listed as soon as one numeric column is missing.
# Restricting the output to num_cols shows which numeric column the NaN is in.

결측치 여부를 부등호로 처리

# dtypes counted as numeric
numberics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# The regular-season, day-by-day and pre-season tables (in that order) get the
# same treatment: NaN in any numeric column becomes 0.
for _frame in (regular_season_df, day_by_day_df, preseason_df):
    _numeric_cols = _frame.select_dtypes(include=numberics).columns
    _frame[_numeric_cols] = _frame[_numeric_cols].fillna(0)

# Columns of the regular-season table that are not in num_cols
not_num_cols = [name for name in regular_season_df.columns if name not in num_cols]

# Rows with at least one missing non-numeric value, restricted to those columns.
# These columns are not used in the analysis, so they are left unfilled.
_has_missing_text = regular_season_df[not_num_cols].isna().sum(axis=1) > 0
regular_season_df.loc[_has_missing_text, not_num_cols].head()
#결측치 해당 변수는 분석에 사용안하므로 결측치 처리 안함

# Remove rows whose rate statistics contradict their counting statistics.
# Collect the index labels to drop:
drop_index= regular_season_df.loc[
    # at least one hit, yet slugging percentage is 0
    # (the original tested SLG>0 here, which selected almost every valid row)
    ((regular_season_df['H']>0) & (regular_season_df['SLG']==0))|
    
    # at least one hit, walk or hit-by-pitch, yet on-base percentage is 0
    (((regular_season_df['H']>0)|
     (regular_season_df['BB']>0)|
     (regular_season_df['HBP']>0))&
    (regular_season_df['OBP']==0))
].index

# Drop them and renumber the rows
regular_season_df=regular_season_df.drop(drop_index).reset_index(drop=True)

규정타수정의

# Reload the regular-season table from disk
# NOTE(review): this rebinds regular_season_df to the raw CSV, so the columns added and rows changed above are not in the new frame — confirm this is intended
regular_season_df=pd.read_csv('D:/dacon/KBO 타자 OPS 예측/Regular_Season_Batter.csv')

# OPS against at-bats (AB), with a dashed red vertical line at AB = 30
plt.figure(figsize=(6,3))
plt.plot('AB','OPS', data=regular_season_df, linestyle='none', marker='o', markersize=2, color='blue', alpha=0.4)
plt.xlabel('AB', fontsize=14)
plt.ylabel('OPS', fontsize=14)
plt.xticks(list(range(min(regular_season_df['AB']),max(regular_season_df['AB']),30)),rotation=90)
plt.vlines(30,ymin=min(regular_season_df['OPS']), ymax=max(regular_season_df['OPS']),linestyle='dashed',colors='r')
plt.show()

# Quartiles and inter-quartile range of OPS, used for the outlier bounds below
Q1= regular_season_df['OPS'].quantile(0.25) 
Q3= regular_season_df['OPS'].quantile(0.75)
IQR=Q3-Q1

# Rows outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], sorted by AB descending; show the top 10
regular_season_df.loc[(regular_season_df['OPS']<(Q1-1.5*IQR))|
                     (regular_season_df['OPS']>(Q3+1.5*IQR))].sort_values(by=['AB'], axis=0, ascending=False)[['batter_name','AB','year','OPS']].head(10)

IQR을 통해 규정타수의 타당성을 확인

# 31 x positions from 7.01 to 7.31 in steps of 0.01
major_ticks= list(np.round(np.linspace(7.01,7.31,31),2))

july=(day_by_day_df['date']>=7) & (day_by_day_df['date']<8) # mask of rows whose 'date' value is in [7, 8)
# Number of rows per date, in date order
# NOTE(review): plotting against major_ticks assumes exactly 31 distinct dates fall in the mask — confirm
plt.plot(major_ticks, day_by_day_df['date'].loc[july].value_counts().sort_index(), marker='o')
plt.xticks(major_ticks, rotation=90)
plt.show()

경기에 출전한 선수의 합을 통해서 휴식기를 알아낸다
이를 통해 상반기 하반기를 구별

시간변수

선수별 과거 성적을 생성하는 함수 정의

#시간변수를 생성하느 함수 정의
def lag_function(df,var_name, past):
    """Add ``lag{past}_{var_name}``: each batter's ``var_name`` from ``past`` years earlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'batter_name', 'year', 'AB' and ``var_name``. It is modified
        in place: the index is reset to 0..n-1 and the lag column is added.
    var_name : str
        Name of the numeric column to lag.
    past : int
        How many years back to look.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Notes
    -----
    * For a row (name, year) the value comes from the first row with the same
      name and ``year - past``; without such a row the lag is NaN.
    * The lag is also NaN when the AB of that earlier row is below 30.
    * The helper column ``lag{past}_AB`` is always dropped at the end, so calling
      this with ``var_name='AB'`` leaves no lag column behind (unchanged from the
      original implementation).
    """
    df.reset_index(drop=True, inplace=True)
    prefix = 'lag' + str(past) + '_'

    # One (name, year) key per row. Looking keys up in a dict replaces the
    # original full-frame scan per row (quadratic) with a single pass.
    keys = list(zip(df['batter_name'], df['year']))

    # dict.fromkeys de-duplicates while keeping order ('AB' may equal var_name)
    for col in dict.fromkeys(['AB', var_name]):
        first_value = {}
        for key, value in zip(keys, df[col]):
            # A missing name never equals anything under ==, so it must not match here either
            if not pd.isna(key[0]):
                first_value.setdefault(key, value)  # keep the first occurrence only
        df[prefix + col] = np.array(
            [first_value.get((name, year - past), np.nan) for name, year in keys],
            dtype=float,
        )

    # Fewer than 30 at-bats in the earlier season: treat the lagged value as missing
    df.loc[df[prefix + 'AB'] < 30, prefix + var_name] = np.nan
    # The lagged AB was only needed for the filter above
    df.drop(prefix + 'AB', axis=1, inplace=True)
    return df

과거 시간을 for문으로 채우기 전에 결측치로 채워 넣은 점-> 과거 기록이 없으면 결측치로 채우기 위함

# Numeric columns to examine, excluding the ids and the targets OPS / SLG
numberics =['int16','int32','int64','float16','float32','float64']
numberics_cols=list(regular_season_df.select_dtypes(include=numberics).drop(['batter_id','year','OPS','SLG'], axis=1).columns)
regular_season_temp=regular_season_df[numberics_cols+['year','batter_name']].copy()
regular_season_temp= regular_season_temp.loc[regular_season_temp['AB']>=30]

# Add the one-year lag of every selected column
for col in numberics_cols:
    regular_season_temp=lag_function(regular_season_temp,col ,1)
    
# Drop the current-season columns except OBP, which the lags are compared against
numberics_cols.remove('OBP')
regular_season_temp.drop(numberics_cols, axis=1, inplace=True)

# Correlation matrix over the numeric columns only. The frame still holds the string
# column 'batter_name', and DataFrame.corr() raises on it in pandas >= 2.0.
corr= regular_season_temp.select_dtypes(exclude=['object','bool']).corr()
# Order rows (and then columns) by their correlation with OBP, strongest first
corr_matrix= corr.sort_values(by='OBP', axis=0, ascending=False)
corr_matrix= corr_matrix[corr_matrix.index]

# Heatmap of the correlations
f, ax = plt.subplots(figsize=(12,12))

# Mask the upper triangle (diagonal included) so each pair is shown once.
# The builtin bool replaces np.bool, which was removed in NumPy 1.24.
mask= np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)]=True

g= sns.heatmap(corr_matrix, cmap='RdYlGn_r', vmax=1, mask=mask, center=0, annot=True, fmt='.2f', square=True, linewidths=.5, cbar_kws={'shrink':.5})
plt.title('Diagonal Correlation Heatmap')

# Derive sacrifice flies (SF) from the OBP formula

# OBP = (H+BB+HBP)/(AB+BB+HBP+SF)  =>  SF = (H+BB+HBP)/OBP - (AB+BB+HBP)
regular_season_df['SF']= regular_season_df[['H','BB','HBP']].sum(axis=1)/ regular_season_df['OBP']-\
regular_season_df[['AB','BB','HBP']].sum(axis=1)

# OBP == 0 makes the division yield NaN (0/0) or inf (x/0); treat both as "no SF".
# The result is assigned back instead of a chained fillna(inplace=True), which
# pandas warns about and which does not update the frame under copy-on-write.
regular_season_df['SF']=regular_season_df['SF'].replace([np.inf,-np.inf],np.nan).fillna(0)
regular_season_df['SF']=regular_season_df['SF'].apply(lambda x: round(x,0)) 

# Sacrifice flies per at-bat over the full season; keep only the columns needed for merging.
# The rate is later applied to first-half at-bats from the day-by-day table to
# estimate first-half SF, and from that the first-half OBP per player.
regular_season_df['SF_1']=regular_season_df['SF']/regular_season_df['AB']
regular_season_df_SF=regular_season_df[['batter_name','year','SF_1']]
regular_season_df_SF

# First-half totals (date <= 7.18) of the OBP components per player and year.
# Columns are selected with a list: the tuple form ['AB','H',...] was removed in pandas 2.0.
sum_hf_yr_OBP=day_by_day_df.loc[day_by_day_df['date']<=7.18].groupby(['batter_name','year'])[['AB','H','BB','HBP']].sum().reset_index()

# Attach the per-at-bat sacrifice-fly rate computed from the regular-season table
sum_hf_yr_OBP=sum_hf_yr_OBP.merge(regular_season_df_SF, how='left', on=['batter_name','year'])

# Estimated first-half sacrifice flies = rate * first-half at-bats, rounded
sum_hf_yr_OBP['SF']=(sum_hf_yr_OBP['SF_1']*sum_hf_yr_OBP['AB']).apply(lambda x:round(x,0))
sum_hf_yr_OBP.drop('SF_1', axis=1, inplace=True) # the rate is no longer needed

# First-half OBP = (H+BB+HBP) / (AB+BB+HBP+SF)
sum_hf_yr_OBP['OBP']=sum_hf_yr_OBP[['H','BB','HBP']].sum(axis=1)/ sum_hf_yr_OBP[['AB','BB','HBP','SF']].sum(axis=1)

# Missing OBP (0/0) becomes 0; assigned back rather than chained inplace
sum_hf_yr_OBP['OBP']=sum_hf_yr_OBP['OBP'].fillna(0)

# Keep only the columns used later
sum_hf_yr_OBP = sum_hf_yr_OBP[['batter_name','year','AB','OBP']]
sum_hf_yr_OBP

추가 변수 생성

# Age = season year minus the first four characters of 'year_born' read as an int
# NOTE(review): assumes 'year_born' is a string that starts with a 4-digit year — confirm
regular_season_df['age']=regular_season_df['year']-regular_season_df['year_born'].apply(lambda x:int(x[:4]))

# Mean and median OBP per age, using only rows with at least 30 at-bats
temp_df=regular_season_df.loc[regular_season_df['AB']>=30].groupby('age').agg({'OBP':['mean','median']}).reset_index()
temp_df.columns= temp_df.columns.droplevel()
temp_df.columns=['age','mean_OBP','median_OBP']

# Mean OBP against age
plt.figure(figsize=(12,8))
plt.plot('age','mean_OBP', data=temp_df, marker='o', markerfacecolor='red', markersize=12, color='skyblue', linewidth=4)
plt.ylabel('평균OBP')
plt.xlabel('나이')
plt.show()

# Attach each player's age for that season
sum_hf_yr_OBP=sum_hf_yr_OBP.merge(regular_season_df[['batter_name','year','age']],
                                 how='left',on=['batter_name','year'])

# Add the OBP of one, two and three years earlier as lag1_OBP .. lag3_OBP
sum_hf_yr_OBP= lag_function(sum_hf_yr_OBP,'OBP',1)
sum_hf_yr_OBP= lag_function(sum_hf_yr_OBP,'OBP',2)
sum_hf_yr_OBP= lag_function(sum_hf_yr_OBP,'OBP',3)
sum_hf_yr_OBP

데이터 사후 처리

# Share of missing values in each lag column
round(sum_hf_yr_OBP[['lag1_OBP','lag2_OBP','lag3_OBP']].isna().sum()/ sum_hf_yr_OBP.shape[0],2)

# 1. Career OBP per player, from summed components of seasons with AB >= 30
# SF = (H+BB+HBP)/OBP-(AB+BB+HBP)
# OBP = (H+BB+HBP) / (AB+BB+HBP+SF)
# Columns are selected with a list: the tuple form was removed in pandas 2.0.
player_OBP_mean= regular_season_df.loc[regular_season_df['AB']>=30].groupby('batter_name')[['AB','H','BB','HBP','SF']].sum().reset_index()
player_OBP_mean['mean_OBP']=player_OBP_mean[['H','BB','HBP']].sum(axis=1)/player_OBP_mean[['AB','BB','HBP','SF']].sum(axis=1)

# 2. League OBP per season, computed the same way
season_OBP_mean=regular_season_df.loc[regular_season_df['AB']>=30].groupby('year')[['AB','H','BB','HBP','SF']].sum().reset_index()
season_OBP_mean['mean_OBP']=season_OBP_mean[['H','BB','HBP']].sum(axis=1)/season_OBP_mean[['AB','BB','HBP','SF']].sum(axis=1)
season_OBP_mean=season_OBP_mean[['year','mean_OBP']]

# Attach the player's career OBP as 'mean_OBP'
sum_hf_yr_OBP=sum_hf_yr_OBP.merge(player_OBP_mean[['batter_name','mean_OBP']], how='left', on='batter_name')

# Drop rows of players without a career OBP (~ inverts the isna mask), then renumber
sum_hf_yr_OBP=sum_hf_yr_OBP.loc[~sum_hf_yr_OBP['mean_OBP'].isna()].reset_index(drop=True)
sum_hf_yr_OBP

#결측치 처리하는 함수 정의
def lag_na_fill(data_set,var_name,past,season_var_mean_data):
    """Fill missing ``lag{past}_{var_name}`` with (player mean + season mean) / 2.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Needs the columns 'year', ``mean_{var_name}`` (the player's own mean) and
        ``lag{past}_{var_name}``. Modified in place. Any index is accepted; the
        original looked rows up by the labels 0..n-1.
    var_name : str
        Statistic name, e.g. 'OBP'.
    past : int
        Lag in years; the season mean is taken from ``year - past``.
    season_var_mean_data : pandas.DataFrame
        One row per season with the columns 'year' and ``mean_{var_name}``. If a
        year is repeated, its first row is used.

    Returns
    -------
    pandas.DataFrame
        The same ``data_set`` object.

    Raises
    ------
    IndexError
        If a row that needs filling refers to a season missing from
        ``season_var_mean_data`` (same exception type as the original).
    """
    lag_col = 'lag' + str(past) + '_' + var_name
    mean_col = 'mean_' + var_name

    needs_fill = data_set[lag_col].isna().to_numpy()
    if not needs_fill.any():
        return data_set

    # year -> season mean, keeping the first row per year
    season_mean = (
        season_var_mean_data.drop_duplicates('year')
        .set_index('year')[mean_col]
    )

    past_years = data_set['year'].to_numpy()[needs_fill] - past
    unknown = sorted(set(past_years) - set(season_mean.index))
    if unknown:
        raise IndexError(
            'no season mean of ' + var_name + ' for year(s): ' + str(unknown)
        )

    player_part = data_set[mean_col].to_numpy()[needs_fill]
    season_part = season_mean.reindex(past_years).to_numpy()
    # Average of the player's own mean and the league mean of the lagged season
    data_set.loc[needs_fill, lag_col] = (player_part + season_part) / 2
    return data_set

결측치를 (선수별 평균+시즌별 평균)/2로 대체한다는 인사이트
np.isnan

# Example lookup: the 'mean_OBP' entry of the 1993 season
season_OBP_mean.loc[season_OBP_mean['year']==1993,'mean_'+'OBP']

0    0.333333
Name: mean_OBP, dtype: float64

# First two rows of the per-season OBP table
season_OBP_mean.head(2)

# Fill the missing lag values with lag_na_fill
sum_hf_yr_OBP=lag_na_fill(sum_hf_yr_OBP,'OBP',1,season_OBP_mean) # one year back

sum_hf_yr_OBP=lag_na_fill(sum_hf_yr_OBP,'OBP',2,season_OBP_mean) # two years back

sum_hf_yr_OBP=lag_na_fill(sum_hf_yr_OBP,'OBP',3,season_OBP_mean) # three years back
sum_hf_yr_OBP

과거 성적데이터를 쓴다는 인사이트와 구현하는 코드
함수를 정의하는 과정 방법

SLG 데이터 처리

# Numeric columns to examine, excluding the ids and the targets OPS / OBP

numberics_cols=list(regular_season_df.select_dtypes(include=numberics).drop(['batter_id','year','OPS','OBP'], axis=1).columns)
regular_season_temp = regular_season_df[numberics_cols+['year','batter_name']].copy()
regular_season_temp=regular_season_temp.loc[regular_season_temp['AB']>=30]

# Add the one-year lag of every selected column
for col in numberics_cols:
    regular_season_temp=lag_function(regular_season_temp,col,1)
numberics_cols.remove('SLG') # SLG stays in the frame because the lags are compared against it
regular_season_temp.drop(numberics_cols, axis=1,inplace=True)

# Correlation matrix over the numeric columns only. The frame still holds the string
# column 'batter_name', and DataFrame.corr() raises on it in pandas >= 2.0.
corr=regular_season_temp.select_dtypes(exclude=['object','bool']).corr()
# Order rows (and then columns) by their correlation with SLG, strongest first
corr_matrix=corr.sort_values(by='SLG', axis=0, ascending=False)
corr_matrix=corr_matrix[corr_matrix.index]

# Heatmap of the correlations
f,ax = plt.subplots(figsize=(12,12)) # f: the figure, ax: its axes

# Mask the upper triangle (diagonal included) so each pair is shown once.
# The builtin bool replaces np.bool, which was removed in NumPy 1.24.
mask= np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)]=True

g= sns.heatmap(corr_matrix, cmap='RdYlGn_r', vmax=1, mask=mask, center=0, annot=True, fmt='.2f', square=True, linewidths=.5, cbar_kws={'shrink':.5})
plt.title('Diagonal Correlation Heatmap')

SLG 예측에 필요한 변수를 파악하기 위한 상관성

# First-half totals (date <= 7.18) of the SLG components per player and year.
# Columns are selected with a list: the tuple form was removed in pandas 2.0.
sum_hf_yr_SLG=day_by_day_df.loc[day_by_day_df['date']<=7.18].groupby(['batter_name','year'])[['AB','H','2B','3B','HR']].sum().reset_index()

# First-half SLG = (singles + 2*2B + 3*3B + 4*HR) / AB, with singles = H - (2B+3B+HR);
# sum(axis=1) adds across the columns of each row
sum_hf_yr_SLG['SLG']=(sum_hf_yr_SLG['H']-sum_hf_yr_SLG[['2B','3B','HR']].sum(axis=1)+sum_hf_yr_SLG['2B']*2+sum_hf_yr_SLG['3B']*3+\
                      sum_hf_yr_SLG['HR']*4)/sum_hf_yr_SLG['AB']

# Missing SLG (0/0) becomes 0; assigned back rather than chained inplace
sum_hf_yr_SLG['SLG']=sum_hf_yr_SLG['SLG'].fillna(0)

# Keep the needed columns and attach each player's age for that season
sum_hf_yr_SLG=sum_hf_yr_SLG[['batter_name','year','AB','SLG']]
sum_hf_yr_SLG=sum_hf_yr_SLG.merge(regular_season_df[['age','batter_name','year']], how='left', on=['batter_name','year'] )
sum_hf_yr_SLG.head()

# Add the SLG of one, two and three years earlier as lag1_SLG .. lag3_SLG
sum_hf_yr_SLG=lag_function(sum_hf_yr_SLG,'SLG',1)
sum_hf_yr_SLG=lag_function(sum_hf_yr_SLG,'SLG',2)
sum_hf_yr_SLG=lag_function(sum_hf_yr_SLG,'SLG',3)

display(sum_hf_yr_SLG.head())

# Share of missing values in each lag column
round(sum_hf_yr_SLG[['lag1_SLG','lag2_SLG','lag3_SLG']].isna().sum()/sum_hf_yr_SLG.shape[0],2)

# Fill missing lags from the season mean and the player's career mean

# Career SLG per player, from summed components of seasons with AB >= 30.
# Columns are selected with a list: the tuple form was removed in pandas 2.0.
player_SLG_mean= regular_season_df.loc[regular_season_df['AB']>=30].groupby('batter_name')[['AB','H','2B','3B','HR']].sum().reset_index()

player_SLG_mean['mean_SLG']= (player_SLG_mean['H']-player_SLG_mean[['2B','3B','HR']].sum(axis=1)+player_SLG_mean['2B']*2+player_SLG_mean['3B']*3+\
                      player_SLG_mean['HR']*4)/player_SLG_mean['AB']

# League SLG per season, computed the same way
season_SLG_mean=regular_season_df.loc[regular_season_df['AB']>=30].groupby('year')[['AB','H','2B','3B','HR']].sum().reset_index()
season_SLG_mean['mean_SLG']=(season_SLG_mean['H']-season_SLG_mean[['2B','3B','HR']].sum(axis=1)+season_SLG_mean['2B']*2+season_SLG_mean['3B']*3+\
                      season_SLG_mean['HR']*4)/season_SLG_mean['AB']

# Attach the player's career SLG as 'mean_SLG'
sum_hf_yr_SLG=sum_hf_yr_SLG.merge(player_SLG_mean[['batter_name','mean_SLG']], how='left', on='batter_name')

# Drop rows of players without a career SLG, then renumber
sum_hf_yr_SLG=sum_hf_yr_SLG.loc[~sum_hf_yr_SLG['mean_SLG'].isna()].reset_index(drop=True)
sum_hf_yr_SLG

# Fill the missing lag values
sum_hf_yr_SLG=lag_na_fill(sum_hf_yr_SLG,'SLG',1, season_SLG_mean) # one year back
sum_hf_yr_SLG=lag_na_fill(sum_hf_yr_SLG,'SLG',2, season_SLG_mean) # two years back
sum_hf_yr_SLG=lag_na_fill(sum_hf_yr_SLG,'SLG',3, season_SLG_mean) # three years back

display(sum_hf_yr_SLG.head())
round(sum_hf_yr_SLG[['lag1_SLG','lag2_SLG','lag3_SLG']].isna().sum()/sum_hf_yr_SLG.shape[0],2)

모델구축과 검증

Lasso, Ridge

# Only rows with at least 30 first-half at-bats are used for modelling
sum_hf_yr_OBP = sum_hf_yr_OBP.loc[sum_hf_yr_OBP['AB'] >= 30]
sum_hf_yr_SLG = sum_hf_yr_SLG.loc[sum_hf_yr_SLG['AB'] >= 30]

# Season 2018 is held out as the test set; every other season is training data
_obp_is_2018 = sum_hf_yr_OBP['year'] == 2018
_slg_is_2018 = sum_hf_yr_SLG['year'] == 2018

OBP_train = sum_hf_yr_OBP.loc[~_obp_is_2018]
OBP_test = sum_hf_yr_OBP.loc[_obp_is_2018]

SLG_train = sum_hf_yr_SLG.loc[~_slg_is_2018]
SLG_test = sum_hf_yr_SLG.loc[_slg_is_2018]

print(OBP_train.shape, OBP_test.shape, SLG_train.shape, SLG_test.shape)

(872, 9) (150, 9) (872, 9) (150, 9)

#평가지표 정의
def wrmse(v,w,p):
    """Weighted root-mean-square error: sqrt( sum(w * (v - p)**2) / sum(w) ).

    Parameters
    ----------
    v : array-like
        Actual values.
    w : array-like
        Weights (at-bats), one per value.
    p : array-like
        Predicted values.

    Returns
    -------
    float
        The weighted RMSE.

    Notes
    -----
    The original summed the square roots of the individual weighted terms, which
    is not a root-mean-square and grows with the number of rows. Inputs are
    converted to arrays so a pandas index cannot mis-align values and predictions.
    """
    actual = np.asarray(v, dtype=float)
    weights = np.asarray(w, dtype=float)
    predicted = np.asarray(p, dtype=float)
    return np.sqrt(np.sum(weights * (actual - predicted) ** 2) / np.sum(weights))

이 대회에서만 쓰인 평가지표

# Ridge and Lasso linear models
from sklearn.linear_model import Ridge,Lasso
from sklearn.model_selection import GridSearchCV

# Search alpha on a log grid from 1e-04 to 1e+01
lasso_params={'alpha':np.logspace(-4,1,6)} # array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01])
ridge_params={'alpha':np.logspace(-4,1,6)}

# Best Lasso / Ridge estimator for OBP, fitted on the last five columns of the training frame.
# Ridge now uses ridge_params; the original passed lasso_params to both models and
# left ridge_params unused (the grids are currently identical, so results do not change).
OBP_linear_models={
    'Lasso':GridSearchCV(Lasso(), param_grid=lasso_params).fit(OBP_train.iloc[:,-5:],OBP_train['OBP']).best_estimator_,
    'Ridge':GridSearchCV(Ridge(), param_grid=ridge_params).fit(OBP_train.iloc[:,-5:],OBP_train['OBP']).best_estimator_,
}

# Best Lasso / Ridge estimator for SLG
SLG_linear_models={
    'Lasso':GridSearchCV(Lasso(), param_grid=lasso_params).fit(SLG_train.iloc[:,-5:],SLG_train['SLG']).best_estimator_,
    'Ridge':GridSearchCV(Ridge(), param_grid=ridge_params).fit(SLG_train.iloc[:,-5:],SLG_train['SLG']).best_estimator_,
}

Randomforest

import time
from sklearn.ensemble import RandomForestRegressor
start=time.time() # start of the timing

# Random-forest hyper-parameter grid
RF_params = {
    # The original list contained 100 twice, which only repeated identical fits.
    # NOTE(review): the second 100 may have been meant as 1000 — confirm before adding it
    "n_estimators":[50,100,150,200,300,500],
    # 1.0 (use all features) replaces 'auto', which meant the same for regressors
    # and was removed in scikit-learn 1.3
    "max_features":[1.0,'sqrt'],
    "max_depth":[1,2,3,4,5,6,10],
    "min_samples_leaf":[1,2,4],
    "min_samples_split":[2,3,5,10]}

# Best random forest for OBP, fitted on the last five columns of the training frame
OBP_RF_models={
    "RF":GridSearchCV(
    RandomForestRegressor(random_state=42), param_grid=RF_params, n_jobs=-1).fit(OBP_train.iloc[:,-5:],OBP_train['OBP']).best_estimator_}

# Best random forest for SLG
SLG_RF_models={
    "RF":GridSearchCV(
    RandomForestRegressor(random_state=42), param_grid=RF_params, n_jobs=-1).fit(SLG_train.iloc[:,-5:],SLG_train['SLG']).best_estimator_}

print(f"걸린시간 : {np.round(time.time() -start,3)}초") # elapsed seconds = now - start

XGBoost

import xgboost as xgb
# from xgboost import XGBRegressor
start=time.time()

# XGBoost hyper-parameter grid
XGB_params={
    'min_child_weight':[1,3,5,10],
    'gamma':[0.3,0.5,1,1.5,2,5],
    'subsample':[0.6,0.8,1.0],
    'colsample_bytree':[0.6,0.8,1.0],
    'max_depth':[3,4,5,7,10]}

# One grid search per target (OBP and SLG), both over the same grid
XGB_OBP_gridsearch= GridSearchCV(xgb.XGBRegressor(random_state=42),
                                param_grid=XGB_params, n_jobs=-1)

XGB_SLG_gridsearch= GridSearchCV(xgb.XGBRegressor(random_state=42),
                                param_grid=XGB_params, n_jobs=-1)

# Fit on the last five columns of each training frame
XGB_OBP_gridsearch.fit(OBP_train.iloc[:,-5:],OBP_train['OBP'])
XGB_SLG_gridsearch.fit(SLG_train.iloc[:,-5:],SLG_train['SLG'])

print(f"걸린시간 : {np.round(time.time() -start,3)}초")

알고리즘별 성능비교

# Predict OBP for the 2018 test rows with each model
Lasso_OBP=OBP_linear_models['Lasso'].predict(OBP_test.iloc[:,-5:])
Ridge_OBP=OBP_linear_models['Ridge'].predict(OBP_test.iloc[:,-5:])
RF_OBP=OBP_RF_models['RF'].predict(OBP_test.iloc[:,-5:])
XGB_OBP=XGB_OBP_gridsearch.predict(OBP_test.iloc[:,-5:])

# WRMSE of each model on the test rows
wrmse_score=[wrmse(OBP_test['OBP'],OBP_test['AB'],Lasso_OBP),  # argument order: actual, at-bats, predicted
            wrmse(OBP_test['OBP'],OBP_test['AB'],Ridge_OBP),
            wrmse(OBP_test['OBP'],OBP_test['AB'],RF_OBP),
            wrmse(OBP_test['OBP'],OBP_test['AB'],XGB_OBP)]

x_lab=['Lasso','Ridge','RF','XGB']

plt.bar(x_lab,wrmse_score)
plt.title('WRMSE of OBP', fontsize=20)
plt.xlabel('model', fontsize=8)
plt.ylabel("",fontsize=18)
plt.ylim(0,0.5)

# Write each score above its bar
for i,v in enumerate(wrmse_score):
    plt.text(i-0.1,v+0.01,str(np.round(v,3))) # x position, y position, text
plt.show()

# Predict SLG for the 2018 test rows with each model
Lasso_SLG=SLG_linear_models['Lasso'].predict(SLG_test.iloc[:,-5:])
Ridge_SLG=SLG_linear_models['Ridge'].predict(SLG_test.iloc[:,-5:])
RF_SLG=SLG_RF_models['RF'].predict(SLG_test.iloc[:,-5:])
XGB_SLG=XGB_SLG_gridsearch.predict(SLG_test.iloc[:,-5:])

# WRMSE of each model on the test rows
wrmse_score_SLG=[wrmse(SLG_test['SLG'],SLG_test['AB'],Lasso_SLG),  # argument order: actual, at-bats, predicted
            wrmse(SLG_test['SLG'],SLG_test['AB'],Ridge_SLG),
            wrmse(SLG_test['SLG'],SLG_test['AB'],RF_SLG),
            wrmse(SLG_test['SLG'],SLG_test['AB'],XGB_SLG)]

x_lab=['Lasso','Ridge','RF','XGB']

plt.bar(x_lab,wrmse_score_SLG)
plt.title('WRMSE of SLG', fontsize=20)
plt.xlabel('model', fontsize=8)
plt.ylabel("",fontsize=18)
plt.ylim(0,0.9)

# Write each score above its bar
for i,v in enumerate(wrmse_score_SLG):
    plt.text(i-0.1,v+0.01,str(np.round(v,3))) # x position, y position, text
plt.show()

결과해석 및 평가
- 변수의 중요도를 랜덤포레스트를 통해 알수가 있다

plt.figure(figsize=(15,6))

# Horizontal bar charts of the random-forest feature importances, one panel per target
plt.subplot(1,2,1)
plt.barh(OBP_train.iloc[:,-5:].columns, OBP_RF_models['RF'].feature_importances_)
plt.title('Feature importance of RF in OBP')

plt.subplot(1,2,2)
plt.barh(SLG_train.iloc[:,-5:].columns, SLG_RF_models['RF'].feature_importances_)
plt.title('Feature importance of RF in SLG')
plt.show()

라쏘와 릿지 회귀모델

# alpha of the OBP Lasso model selected by GridSearchCV
print('Alpha:',OBP_linear_models['Lasso'].alpha)

# Coefficients of the OBP Lasso model as a one-row table with the feature names as columns
display(pd.DataFrame(OBP_linear_models['Lasso'].coef_.reshape(-1,5),
                    columns=OBP_train.iloc[:,-5:].columns, index=['coefficient']))

# alpha of the SLG Lasso model selected by GridSearchCV
print('Alpha:',SLG_linear_models['Lasso'].alpha)

# Coefficients of the SLG Lasso model as a one-row table with the feature names as columns
display(pd.DataFrame(SLG_linear_models['Lasso'].coef_.reshape(-1,5),
                    columns=SLG_train.iloc[:,-5:].columns, index=['coefficient']))

from sklearn.linear_model import lars_path

plt.figure(figsize=(15,4.8))
plt.subplot(1,2,1)

# Lasso path for the OBP model: the alphas and the coefficients at each step
alphas,_,coefs = lars_path(OBP_train.iloc[:,-5:].values, OBP_train['OBP'], method='lasso',verbose=True)

# Sum of absolute coefficients at each step of the path
xx= np.sum(np.abs(coefs.T),axis=1) # coefs.T has one row per step, so axis=1 gives one sum per step (the author observed shape (6,5))

# Normalise by the last entry of xx, so the x axis ends at 1
xx/=xx[-1] # the author observed 0.81069777 as that last value

plt.plot(xx, coefs.T)
plt.xlabel('|coef|/max|coef|')
plt.ylabel('cofficients')
plt.title('OBP LASSO path')
plt.axis('tight')
plt.legend(OBP_train.iloc[:,-5:].columns)

plt.subplot(1,2,2)
# Lasso path for the SLG model: the alphas and the coefficients at each step
alphas,_,coefs = lars_path(SLG_train.iloc[:,-5:].values, SLG_train['SLG'], method='lasso',verbose=True)

# Sum of absolute coefficients at each step of the path
xx= np.sum(np.abs(coefs.T),axis=1)

# Normalise by the last entry of xx, so the x axis ends at 1
xx/=xx[-1]

plt.plot(xx, coefs.T)
plt.xlabel('|coef|/max|coef|')
plt.ylabel('cofficients')
plt.title('SLG LASSO path')
plt.axis('tight')
plt.legend(SLG_train.iloc[:,-5:].columns)
plt.show()

위 그래프 원리는 좀더 공부가 필요해 보임

앙상블

# WRMSE of the simple average of the Lasso and random-forest predictions, per target
print('OBP model averaging:', wrmse(OBP_test['OBP'], OBP_test['AB'],(Lasso_OBP+RF_OBP)/2))
print('SLG model averaging:', wrmse(SLG_test['SLG'], SLG_test['AB'],(Lasso_SLG+RF_SLG)/2))

OBP model averaging: 0.3181395239559683
SLG model averaging: 0.6717303946958075

단순화된 모델 생성

# Keep the already-preprocessed frame so it can be restored at the end of
# this experiment (the final submission code expects the original frame).
sum_hf_yr_OBP_origin = sum_hf_yr_OBP.copy()

# Back out season sacrifice flies from OBP = (H+BB+HBP) / (AB+BB+HBP+SF):
#     SF = (H+BB+HBP) / OBP - (AB+BB+HBP)
# The result is written to regular_season_df itself; regular_season_df_SF is
# only created further down, so it cannot be the target here.
times_on_base = regular_season_df[['H', 'BB', 'HBP']].sum(axis=1)
plate_app_without_sf = regular_season_df[['AB', 'BB', 'HBP']].sum(axis=1)
regular_season_df['SF'] = times_on_base / regular_season_df['OBP'] - plate_app_without_sf
# Missing values become 0; plain assignment instead of fillna(inplace=True)
# on a selected column, which is not guaranteed to update the frame.
regular_season_df['SF'] = regular_season_df['SF'].fillna(0)
regular_season_df['SF'] = regular_season_df['SF'].apply(lambda x: round(x, 0))  # round to whole numbers

# Sacrifice flies per at-bat; keep only the columns needed for the merge below.
regular_season_df['SF_1'] = regular_season_df['SF'] / regular_season_df['AB']
regular_season_df_SF = regular_season_df[['batter_name', 'year', 'SF_1']]

# Per player and year, sum the daily records whose 'date' value is <= 7.18
# (first half of the season per the original notes; presumably a month.day
# encoding -- confirm against where day_by_day_df is built).
# A list (not a tuple) selects several groupby columns; pandas 2.x rejects tuples.
first_half_records = day_by_day_df.loc[day_by_day_df['date'] <= 7.18]
sum_hf_yr_OBP = (
    first_half_records
    .groupby(['batter_name', 'year'])[['AB', 'H', 'BB', 'HBP', 'RBI', '2B', '3B', 'HR']]
    .sum()
    .reset_index()
)

# Attach the regular-season sacrifice-fly rate to the first-half totals.
sum_hf_yr_OBP = sum_hf_yr_OBP.merge(regular_season_df_SF, how='left', on=['batter_name', 'year'])

# Estimate first-half sacrifice flies by applying the season rate to first-half at-bats.
sum_hf_yr_OBP['SF'] = (sum_hf_yr_OBP['SF_1'] * sum_hf_yr_OBP['AB']).apply(lambda x: round(x, 0))
sum_hf_yr_OBP.drop('SF_1', axis=1, inplace=True)

# First-half on-base percentage.
sum_hf_yr_OBP['OBP'] = (
    sum_hf_yr_OBP[['H', 'BB', 'HBP']].sum(axis=1)
    / sum_hf_yr_OBP[['AB', 'BB', 'HBP', 'SF']].sum(axis=1)
)
sum_hf_yr_OBP['OBP'] = sum_hf_yr_OBP['OBP'].fillna(0)

# Total bases. H already counts every hit once, so each double/triple/home run
# only contributes its extra 1/2/3 bases. This matches the SLG numerator used
# later in the file: (H - 2B - 3B - HR) + 2*2B + 3*3B + 4*HR.
sum_hf_yr_OBP['TB'] = (
    sum_hf_yr_OBP['H']
    + sum_hf_yr_OBP['2B']
    + sum_hf_yr_OBP['3B'] * 2
    + sum_hf_yr_OBP['HR'] * 3
)
sum_hf_yr_OBP = sum_hf_yr_OBP[['batter_name', 'year', 'AB', 'OBP', 'BB', 'TB', 'RBI']]

# Add age.
sum_hf_yr_OBP = sum_hf_yr_OBP.merge(regular_season_df[['batter_name', 'year', 'age']],
                                    how='left', on=['batter_name', 'year'])

# Add each player's mean OBP and drop players that have none.
sum_hf_yr_OBP = sum_hf_yr_OBP.merge(player_OBP_mean[['batter_name', 'mean_OBP']],
                                    how='left', on='batter_name')
sum_hf_yr_OBP = sum_hf_yr_OBP.loc[~sum_hf_yr_OBP['mean_OBP'].isna()].reset_index(drop=True)

# Previous-year value of each variable (lag_function is defined earlier in the file).
for lagged_col in ('BB', 'TB', 'RBI', 'OBP'):
    sum_hf_yr_OBP = lag_function(sum_hf_yr_OBP, lagged_col, 1)

sum_hf_yr_OBP = sum_hf_yr_OBP.dropna()  # drop rows with any missing value

# Feature sets: a minimal one and a richer one.
# 'lag1_OBP' is listed once only; a repeated name would hand the model the
# same column twice.
feature_list1 = ['age', 'lag1_OBP', 'mean_OBP']
feature_list2 = ['age', 'lag1_OBP', 'lag1_BB', 'lag1_TB', 'lag1_RBI', 'mean_OBP']

# Train and evaluate only on rows with at least 30 at-bats.
sum_hf_yr_OBP = sum_hf_yr_OBP.loc[sum_hf_yr_OBP['AB'] >= 30]

# 2018 is held out as the test set; every other year is used for training.
OBP_train = sum_hf_yr_OBP.loc[sum_hf_yr_OBP['year'] != 2018]
OBP_test = sum_hf_yr_OBP.loc[sum_hf_yr_OBP['year'] == 2018]

# Grid-searched random forests, one per feature set.
OBP_RF_models_1 = {
    'RF': GridSearchCV(RandomForestRegressor(random_state=42), param_grid=RF_params, n_jobs=-1)
    .fit(OBP_train.loc[:, feature_list1], OBP_train['OBP']).best_estimator_
}

OBP_RF_models_2 = {
    'RF': GridSearchCV(RandomForestRegressor(random_state=42), param_grid=RF_params, n_jobs=-1)
    .fit(OBP_train.loc[:, feature_list2], OBP_train['OBP']).best_estimator_
}

# Predict with the dictionaries defined just above.
RF_OBP1 = OBP_RF_models_1['RF'].predict(OBP_test.loc[:, feature_list1])
RF_OBP2 = OBP_RF_models_2['RF'].predict(OBP_test.loc[:, feature_list2])

# wrmse of both models on the 2018 hold-out.
wrmse_score = [wrmse(OBP_test['OBP'], OBP_test['AB'], RF_OBP1),
               wrmse(OBP_test['OBP'], OBP_test['AB'], RF_OBP2)]
x_lab = ['simple', 'complicate']

plt.bar(x_lab, wrmse_score)
plt.title('WRMSE of OBP', fontsize=20)
plt.xlabel('model', fontsize=18)
plt.ylabel('', fontsize=18)  # y label left blank; a second xlabel call would erase 'model'
plt.ylim(0, 0.5)

# Print each score above its bar.
for i, v in enumerate(wrmse_score):
    plt.text(i - 0.1, v + 0.01, str(np.round(v, 3)))
plt.show()

# Restore the original preprocessed frame for the final submission.
sum_hf_yr_OBP = sum_hf_yr_OBP_origin.copy()

테스트 데이터 정제

# Load the submission template and tag every row as the 2019 season.
submission = pd.read_csv('D:/dacon/KBO 타자 OPS 예측/submission.csv')
submission['year'] = 2019

# Birth information per player, one row per distinct (id, name, year_born).
birth_cols = ['batter_id', 'batter_name', 'year_born']
batter_year_born = regular_season_df[birth_cols].copy().drop_duplicates().reset_index(drop=True)

# Age in 2019 = season year minus the birth year, where the birth year is
# the first four characters of 'year_born' parsed as an integer.
submission = submission.merge(batter_year_born, how='left', on=['batter_id', 'batter_name'])
birth_year = submission['year_born'].apply(lambda born: int(born[:4]))
submission['age'] = submission['year'] - birth_year
submission.head()

# Two independent copies: one to receive OBP features, one for SLG features.
submission_OBP = submission.copy()
submission_SLG = submission.copy()

OBP

# Attach each player's mean OBP taken from the preprocessed frame.
player_mean_obp = (sum_hf_yr_OBP[['batter_name', 'mean_OBP']]
                   .drop_duplicates()
                   .reset_index(drop=True))
submission_OBP = submission_OBP.merge(player_mean_obp, how='left', on='batter_name')

# Fill in past performance: OBP of 2018, 2017 and 2016 becomes
# lag1_OBP, lag2_OBP and lag3_OBP (only seasons with at least 30 at-bats).
for years_back in (1, 2, 3):
    season_rows = ((sum_hf_yr_OBP['year'] == (2019 - years_back))
                   & (sum_hf_yr_OBP['AB'] >= 30))
    lag_frame = sum_hf_yr_OBP.loc[season_rows, ['batter_name', 'OBP']].copy()
    lag_frame = lag_frame.rename(columns={'OBP': 'lag' + str(years_back) + '_OBP'})
    submission_OBP = submission_OBP.merge(lag_frame, how='left', on='batter_name')
submission_OBP.head()

case1
- 일별 데이터에 기록이 없어서 mean_OBP가 없는 경우
- 김주찬,이범호

# Case 1: players without a mean_OBP from the daily data.
# Their features are rebuilt from the regular-season table instead.
for batter_name in ['김주찬', '이범호']:
    # Regular-season rows of this player with at least 30 at-bats
    cond_regular = (regular_season_df['AB'] >= 30) & (regular_season_df['batter_name'] == batter_name)

    # At-bat-weighted mean OBP over those seasons
    mean_OBP = sum(regular_season_df.loc[cond_regular, 'AB'] *
                   regular_season_df.loc[cond_regular, 'OBP']) / \
        sum(regular_season_df.loc[cond_regular, 'AB'])

    # Rows of this player in the submission frame
    cond_sub = submission_OBP['batter_name'] == batter_name
    submission_OBP.loc[cond_sub, 'mean_OBP'] = mean_OBP  # replace with the weighted mean

    # OBP of 2018 / 2017 / 2016 goes to lag1_OBP / lag2_OBP / lag3_OBP.
    # Each season gets its own column; writing all three to lag1_OBP would
    # leave lag1 holding the 2016 value and lag2/lag3 untouched.
    for lag in (1, 2, 3):
        season_obp = regular_season_df.loc[
            cond_regular & (regular_season_df['year'] == 2019 - lag), 'OBP'].values
        # No qualifying season that year: leave the value missing rather
        # than fail on an empty assignment.
        if len(season_obp):
            submission_OBP.loc[cond_sub, 'lag' + str(lag) + '_OBP'] = season_obp

case2
- 1998년 혹은 1999년 출생의 신인급 선수
- 성장가능성을 기대 할수 있으므로 2018년 시즌의 성적으로 출루율의 평균을 대체

# Case 2: rookie-level players. Their mean OBP is replaced with the 2018
# season value taken from season_OBP_mean.
# A boolean mask selects the rows by label, and the season value is pulled out
# as a scalar; assigning the one-row Series directly would be aligned on index
# and write NaN to rows whose labels do not match.
rookie_rows = submission_OBP['batter_name'].isin(['고명성','전민재','김철호','신범수','이병휘'])
season_2018_obp = season_OBP_mean.loc[season_OBP_mean['year'] == 2018, 'mean_OBP']
if not season_2018_obp.empty:
    submission_OBP.loc[rookie_rows, 'mean_OBP'] = season_2018_obp.iloc[0]

case3

- 2018년 하반기 성적만 있는 경우
- 정규시즌 성적을 바탕으로 평균 출루율 / 1년 전 출루율 수치를 대체

# Case 3: mean OBP and the one-year-back OBP are taken from the
# regular-season table for these two players.
for late_player in ('전병우', '샌즈'):
    # This player's regular-season rows with at least 30 at-bats
    qualified = regular_season_df.loc[
        (regular_season_df['AB'] >= 30) & (regular_season_df['batter_name'] == late_player)]

    # At-bat-weighted mean OBP of the player
    weighted_obp = sum(qualified['AB'] * qualified['OBP']) / sum(qualified['AB'])

    player_rows = submission_OBP['batter_name'] == late_player
    submission_OBP.loc[player_rows, 'mean_OBP'] = weighted_obp

    # The 2018 regular-season OBP is the one-year lag for the 2019 prediction
    obp_2018 = qualified.loc[qualified['year'] == 2018, 'OBP'].values
    submission_OBP.loc[player_rows, 'lag1_OBP'] = obp_2018

case4

- 은퇴 혹은 1군 수준의 성적을 보여주지 못한 선수
- 하위25%의 성적으로 대체

# Players still lacking a mean OBP get the lower quartile (25%) of
# player_OBP_mean['mean_OBP'].
no_mean_yet = submission_OBP['mean_OBP'].isna()
lower_quartile_obp = np.quantile(player_OBP_mean['mean_OBP'], 0.25)
submission_OBP.loc[no_mean_yet, 'mean_OBP'] = lower_quartile_obp

# Fill the remaining gaps in the 1-, 2- and 3-year lag columns through
# lag_na_fill (defined earlier in the file), one lag at a time.
for years_back in (1, 2, 3):
    submission_OBP = lag_na_fill(submission_OBP, 'OBP', years_back, season_OBP_mean)
submission_OBP.head()

SLG

# Attach each player's mean SLG taken from the preprocessed frame.
player_mean_slg = (sum_hf_yr_SLG[['batter_name', 'mean_SLG']]
                   .drop_duplicates()
                   .reset_index(drop=True))
submission_SLG = submission_SLG.merge(player_mean_slg, how='left', on='batter_name')

# Fill in past performance: SLG of 2018, 2017 and 2016 becomes
# lag1_SLG, lag2_SLG and lag3_SLG (only seasons with at least 30 at-bats).
for years_back in (1, 2, 3):
    season_rows = ((sum_hf_yr_SLG['year'] == (2019 - years_back))
                   & (sum_hf_yr_SLG['AB'] >= 30))
    lag_frame = sum_hf_yr_SLG.loc[season_rows, ['batter_name', 'SLG']].copy()
    lag_frame = lag_frame.rename(columns={'SLG': 'lag' + str(years_back) + '_SLG'})
    submission_SLG = submission_SLG.merge(lag_frame, how='left', on='batter_name')
submission_SLG.head()

# Names of the players whose mean SLG is still missing.
submission_SLG.loc[submission_SLG['mean_SLG'].isna(), 'batter_name'].values

# case1: players without a mean_SLG from the daily data.
# Their features are rebuilt from the regular-season table instead.
for batter_name in ['김주찬', '이범호']:
    # Regular-season rows of this player with at least 30 at-bats
    cond_regular = (regular_season_df['AB'] >= 30) & (regular_season_df['batter_name'] == batter_name)

    # At-bat-weighted mean SLG over those seasons
    mean_SLG = sum(regular_season_df.loc[cond_regular, 'AB'] *
                   regular_season_df.loc[cond_regular, 'SLG']) / \
        sum(regular_season_df.loc[cond_regular, 'AB'])

    # Rows of this player in the submission frame
    cond_sub = submission_SLG['batter_name'] == batter_name
    submission_SLG.loc[cond_sub, 'mean_SLG'] = mean_SLG  # replace with the weighted mean

    # SLG of 2018 / 2017 / 2016 goes to lag1_SLG / lag2_SLG / lag3_SLG.
    # Each season gets its own column; writing all three to lag1_SLG would
    # leave lag1 holding the 2016 value and lag2/lag3 untouched.
    for lag in (1, 2, 3):
        season_slg = regular_season_df.loc[
            cond_regular & (regular_season_df['year'] == 2019 - lag), 'SLG'].values
        # No qualifying season that year: leave the value missing rather
        # than fail on an empty assignment.
        if len(season_slg):
            submission_SLG.loc[cond_sub, 'lag' + str(lag) + '_SLG'] = season_slg
    
    
# case2: rookie-level players. Their mean SLG is replaced with the 2018
# season value taken from season_SLG_mean.
# A boolean mask selects the rows by label, and the season value is pulled out
# as a scalar; assigning the one-row Series directly would be aligned on index
# and write NaN to rows whose labels do not match.
rookie_rows = submission_SLG['batter_name'].isin(['고명성','전민재','김철호','신범수','이병휘'])
season_2018_slg = season_SLG_mean.loc[season_SLG_mean['year'] == 2018, 'mean_SLG']
if not season_2018_slg.empty:
    submission_SLG.loc[rookie_rows, 'mean_SLG'] = season_2018_slg.iloc[0]
    
# case3: mean SLG and the one-year-back SLG are taken from the
# regular-season table for these two players.
for late_player in ('전병우', '샌즈'):
    # This player's regular-season rows with at least 30 at-bats
    qualified = regular_season_df.loc[
        (regular_season_df['AB'] >= 30) & (regular_season_df['batter_name'] == late_player)]

    # At-bat-weighted mean SLG of the player
    weighted_slg = sum(qualified['AB'] * qualified['SLG']) / sum(qualified['AB'])

    player_rows = submission_SLG['batter_name'] == late_player
    submission_SLG.loc[player_rows, 'mean_SLG'] = weighted_slg

    # The 2018 regular-season SLG is the one-year lag for the 2019 prediction
    slg_2018 = qualified.loc[qualified['year'] == 2018, 'SLG'].values
    submission_SLG.loc[player_rows, 'lag1_SLG'] = slg_2018
# case4
# Players still lacking a mean SLG get the lower quartile (25%) of
# player_SLG_mean['mean_SLG'].
no_mean_yet = submission_SLG['mean_SLG'].isna()
lower_quartile_slg = np.quantile(player_SLG_mean['mean_SLG'], 0.25)
submission_SLG.loc[no_mean_yet, 'mean_SLG'] = lower_quartile_slg

# Fill the remaining gaps in the 1-, 2- and 3-year lag columns through
# lag_na_fill (defined earlier in the file), one lag at a time.
for years_back in (1, 2, 3):
    submission_SLG = lag_na_fill(submission_SLG, 'SLG', years_back, season_SLG_mean)
submission_SLG.head()

### OBP,SLG 둘다 lasso 모델에서 가장 좋은 성능을 보였으므로 lasso로 예측을 시행한다

# Predict OBP with the OBP Lasso model on the last five columns of the OBP frame.
predict_OBP = OBP_linear_models['Lasso'].predict(submission_OBP.iloc[:, -5:])
# Predict SLG with the SLG Lasso model on the SLG frame's own features
# (the SLG model must not be fed the OBP features).
predict_SLG = SLG_linear_models['Lasso'].predict(submission_SLG.iloc[:, -5:])

# Copy the id columns so adding 'OPS' writes to an independent frame
# instead of a slice of `submission`.
final_submission = submission[['batter_id', 'batter_name']].copy()
final_submission['OPS'] = predict_SLG + predict_OBP  # OPS = OBP + SLG
final_submission.head()

반발계수의 변화

# All league-wide figures below use only player-seasons with at least 30 at-bats.
qualified_seasons = regular_season_df.loc[regular_season_df['AB'] >= 30]

# League-wide OBP per season.
season_OBP = qualified_seasons.groupby('year').agg(
    {'AB': 'sum', 'H': 'sum', 'BB': 'sum', 'HBP': 'sum', 'SF': 'sum'}).reset_index()
season_OBP['OBP'] = (season_OBP[['H', 'BB', 'HBP']].sum(axis=1)
                     / season_OBP[['AB', 'BB', 'HBP', 'SF']].sum(axis=1))

# League-wide SLG per season: singles (H minus extra-base hits) count one base,
# doubles two, triples three and home runs four, divided by at-bats.
season_SLG = qualified_seasons.groupby('year').agg(
    {'AB': 'sum', 'H': 'sum', '2B': 'sum', '3B': 'sum', 'HR': 'sum'}).reset_index()
season_SLG['SLG'] = ((season_SLG['H'] - season_SLG[['2B', '3B', 'HR']].sum(axis=1)) +
                     season_SLG['2B'] * 2 + season_SLG['3B'] * 3 + season_SLG['HR'] * 4) / season_SLG['AB']

# Merge both on year and compute OPS from the merged frame's own columns, so
# the sum never depends on the two source frames sharing the same row index.
season_OPS = pd.merge(season_OBP[['year', 'OBP']], season_SLG[['year', 'SLG']], on='year')
season_OPS['OPS'] = season_OPS['OBP'] + season_OPS['SLG']

# Per season: total home runs, mean home runs per player-season, and row count.
season_HR = qualified_seasons.groupby('year').agg({'HR': ['sum', 'mean', 'count']}).reset_index()
season_HR.columns = ['year', 'sum_HR', 'mean_HR', 'count']

# Attach the home-run figures to the OPS table.
season_OPS = season_OPS.merge(season_HR, on='year', how='left')
display(season_OPS)

# Seasons after 2000 only (the original notes say earlier seasons have too
# little data). NOTE(review): the result is not assigned, so this is a view
# for inspection and the rows stay in season_OPS below -- confirm intent.
season_OPS.loc[season_OPS['year']>2000]

# Difference between each season's mean home runs and the last row's value
# (2018 per the original notes; groupby leaves the rows ordered by year).
season_OPS['HR_diff'] = season_OPS['mean_HR'] - season_OPS['mean_HR'].iloc[-1]
difference = season_OPS.sort_values(by='HR_diff')[['year', 'OPS', 'HR_diff']]
display(difference.reset_index(drop=True).head(12))

# Shift every predicted OPS down by 0.038 (hand-picked correction; presumably
# read off the table above -- confirm).
final_submission['OPS'] = final_submission['OPS'] - 0.038
display(final_submission.head(10))
# final_submission.to_csv('submissionb.csv', index=False) # final submission file

이번 실습은 야구에 대한 도메인 지식이 부족한 상태에서 진행해서 그런지 내용을 이해하는 데 시간이 걸렸습니다

모델구축 부분에서는 제 자원이 부족해서 미처 실행하지 못한점이 아쉬웠습니다

이 실습을 통해서 결측치를 다루는 방법을 볼수 있었다는 점이 가장 좋았습니다

제 생각보다 결측치 다루는게 난이도가 있었습니다

개인 프로젝트 진행 시 좋은 참고가 될것같습니다

'실습 note' 카테고리의 다른 글

OpenCV_4(기하학적 변환) (0)	2021.02.22
버스 승차인원 예측 실습(데이콘 경진대회 1등 솔루션) (0)	2021.02.20
OpenCV_3(필터링) (0)	2021.02.19
OpenCV_2(기본 영상처리) (0)	2021.02.18
OpenCV_1(기초 사용법) (0)	2021.02.17

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

H_record

KBO 타자 OPS 예측 실습(데이콘 경진대회 1등 솔루션)