사용자 사전이 추가된 Mecab 형태소 분석 in Colab

단어 통일화 및 사용자 사전 구축에 따른 mecab 형태소 분석을 Class로 정의하였습니다.

클래스 내 함수 `_morph`는 분석 대상 컬럼명(`정제_평가소견내용`)이 하드코딩되어 있으므로, 사용하는 데이터에 맞게 수정이 필요합니다.

또한, 해당 함수는 형태소분석기로 추출된 단어가 사고유형, 장소, 시간 등에 따라 분류가 되는 코드입니다.

만일 추출된 단어별로 재분류가 필요 없다면 `_morph` 함수를 수정해야 합니다.

# 사용자 사전 추가 및 형태소 분석 in Colab

class Corab_morph():
    """Register NER words in the MeCab user dictionary and tag a text column.

    The pipeline (see ``run``) appends every word of the NER dictionary to
    MeCab's ``nnp.csv`` user dictionary, rebuilds and reinstalls the
    dictionary, optionally normalises the text with the word-unification
    table, tokenises each text and sorts the tokens found in the NER
    dictionary into accident type / place / cause / hazardous item / time
    columns.

    The class name keeps its original spelling ("Corab") so existing callers
    keep working.
    """

    # Root of the mecab-ko-dic source tree (contains ``user-dic/`` and
    # ``tools/``). Override on the class or an instance for other installs.
    mecab_dic_dir = "/tmp/mecab-ko-dic-2.1.1-20180720"

    # Entity name ('개체명' column of the NER dictionary) -> prefix of the two
    # output columns written by ``_morph`` (``<prefix>``, ``<prefix>_detail``).
    # Entities not listed here are ignored.
    _NER_COLUMNS = {
        '사고유형': 'types',
        '장소': 'spot',
        '사고원인': 'cause',
        '위해품목': 'object',
        '시간': 'time',
    }

    def __init__(self, df_path, ner_dic_path, user_dic_path, text_col=None, encoding=None):
        """Load the data and both dictionaries and create the tokenizer.

        Args:
            df_path: CSV/Excel file holding the texts to analyse.
            ner_dic_path: CSV/Excel NER dictionary; ``_morph`` reads its
                '단어', '개체명' and '분류' columns.
            user_dic_path: CSV/Excel word-unification table; the first column
                is the text as written, the second its replacement.
            text_col: Name of the text column in the data file.
            encoding: Encoding used when the data file is a CSV.
        """
        self.encoding = encoding
        self.df = self._read_file(df_path, encoding=self.encoding)
        self.ner_dic = self._read_file(ner_dic_path, sheet_name='전체')
        self.user_dic = self._read_file(user_dic_path, sheet_name='전체')
        self.text_col = text_col
        self.tokenizer = Mecab()

    def _read_file(self, path, sheet_name=None, encoding=None):
        """Read a CSV or Excel file into a DataFrame.

        Args:
            path: File path; the type is decided from the file name only, so
                a directory that happens to contain ".csv" does not matter.
            sheet_name: Excel sheet to read. ``None`` reads the first sheet.
            encoding: Text encoding for CSV files.

        Raises:
            ImportError: If the file name is neither CSV nor Excel (exception
                type kept from the original implementation).
        """
        import os

        file_name = os.path.basename(os.fspath(path)).lower()
        if '.csv' in file_name:
            return pd.read_csv(path, encoding=encoding)
        if '.xls' in file_name:  # also matches .xlsx
            # ``sheet_name=None`` would make pandas return a dict of all
            # sheets, so fall back to the first sheet explicitly.
            sheet = 0 if sheet_name is None else sheet_name
            return pd.read_excel(path, sheet_name=sheet)
        raise ImportError(
            f"Unsupported file type (expected .csv, .xls or .xlsx): {path!r}"
        )

    def _get_jongsung_TF(self, sample_text):
        """Return "T" if the last character has a final consonant, else "F".

        The flag is written into the user-dictionary entry by ``_add_dic``.
        Non-Hangul last characters yield "T", as in the original code.

        Raises:
            IndexError: If ``sample_text`` is empty.
        """
        code = ord(sample_text[-1])
        # Precomposed Hangul syllables are laid out as
        # (initial * 21 + vowel) * 28 + final, where final == 0 means "none".
        if 0xAC00 <= code <= 0xD7A3:
            return "T" if (code - 0xAC00) % 28 else "F"
        # A bare vowel jamo (compatibility or conjoining form).
        if 0x314F <= code <= 0x3163 or 0x1161 <= code <= 0x1175:
            return "F"
        return "T"

    def _add_dic(self, word_list: list):
        """Append words as NNP entries to MeCab's ``user-dic/nnp.csv``.

        Words already present in the file (or repeated in ``word_list``) are
        skipped so repeated runs do not pile up duplicate entries; blank or
        non-string values (e.g. NaN cells) are ignored.

        Raises:
            FileNotFoundError: If ``nnp.csv`` does not exist.
        """
        import os

        nnp_path = os.path.join(self.mecab_dic_dir, "user-dic", "nnp.csv")
        with open(nnp_path, 'r', encoding='utf-8') as f:
            file_data = f.readlines()

        # Terminate the last existing entry so a new one is not glued to it.
        if file_data and not file_data[-1].endswith('\n'):
            file_data[-1] += '\n'

        # The surface form is the first CSV field of every entry.
        known_words = {line.split(',', 1)[0] for line in file_data}

        for word in word_list:
            if not isinstance(word, str) or not word.strip():
                continue
            if word in known_words:
                continue
            jongsung_TF = self._get_jongsung_TF(word)
            file_data.append(
                '{},,,,NNP,*,{},{},*,*,*,*,*\n'.format(word, jongsung_TF, word)
            )
            known_words.add(word)

        # Overwrite the dictionary with the merged content.
        with open(nnp_path, 'w', encoding='utf-8') as f:
            f.writelines(file_data)

    def _replace_words(self, text):
        """Apply the word-unification table to ``text`` (e.g. '제 주' -> '제주').

        Rows whose source or replacement is not a string are skipped, and
        sources of one character or less are never replaced. A non-string
        ``text`` (such as NaN) is returned unchanged.
        """
        if not isinstance(text, str):
            return text

        # Column 0: word as it appears in the text, column 1: replacement.
        pairs = zip(self.user_dic.iloc[:, 0], self.user_dic.iloc[:, 1])
        for source, target in pairs:
            if not isinstance(source, str) or not isinstance(target, str):
                continue
            if len(source) > 1:  # single characters are too ambiguous
                text = text.replace(source, target)
        return text

    def _morph(self, df, tokenizer, text_col=None):
        """Tokenise ``df[text_col]`` and classify tokens by NER entity.

        For every entity in ``_NER_COLUMNS`` two columns are written to
        ``df``: ``<prefix>`` with the comma-joined '분류' values and
        ``<prefix>_detail`` with the comma-joined matching words. A word is
        recorded once per text and entity, in order of appearance.

        Args:
            df: DataFrame to analyse; it is modified in place.
            tokenizer: Object exposing ``morphs(text) -> list of tokens``.
            text_col: Column to tokenise. Defaults to '정제_평가소견내용',
                the column name the original implementation hard-coded.

        Returns:
            The same ``df`` with the result columns added.
        """
        if text_col is None:
            text_col = '정제_평가소견내용'

        # word -> (entity, class); the first row wins for duplicated words.
        lookup = {}
        for word, ner, cls in zip(self.ner_dic['단어'],
                                  self.ner_dic['개체명'],
                                  self.ner_dic['분류']):
            lookup.setdefault(word, (ner, cls))

        prefixes = list(self._NER_COLUMNS.values())
        columns = {}
        for prefix in prefixes:
            columns[prefix] = []
            columns[f'{prefix}_detail'] = []

        for sent in tqdm(df[text_col]):
            found = {prefix: ([], []) for prefix in prefixes}
            # Missing texts (NaN) simply produce no matches.
            tokens = tokenizer.morphs(sent) if isinstance(sent, str) else []
            for word in tokens:
                entry = lookup.get(word)
                if entry is None:  # word is not in the NER dictionary
                    continue
                ner, cls = entry
                prefix = self._NER_COLUMNS.get(ner)
                if prefix is None:  # entity we do not report
                    continue
                cls_list, seen_words = found[prefix]
                if word not in seen_words:  # avoid duplicates
                    cls_list.append(cls)
                    seen_words.append(word)
            for prefix, (cls_list, seen_words) in found.items():
                columns[prefix].append(",".join(cls_list))
                columns[f'{prefix}_detail'].append(",".join(seen_words))

        # Assign whole columns positionally so the result does not depend on
        # the DataFrame's index labels.
        for column, values in columns.items():
            df[column] = values
        return df

    def run(self, word_rep=True):
        """Update the MeCab user dictionary and analyse the data.

        Args:
            word_rep: When true, the text column is first normalised with
                ``_replace_words`` into a new '정제_<text_col>' column, and
                that column is analysed; otherwise the raw text column is
                analysed.

        Returns:
            ``self.df`` with the classification columns added.

        Raises:
            subprocess.CalledProcessError: If rebuilding or installing the
                dictionary fails.
        """
        import os
        import subprocess

        # Add the NER words to the user dictionary.
        self._add_dic(self.ner_dic['단어'].to_list())

        # Rebuild every user dictionary managed by mecab-ko-dic ...
        script = os.path.join(self.mecab_dic_dir, "tools", "add-userdic.sh")
        subprocess.run(["bash", script], check=True)
        # ... and install it. ``make`` must run inside the dictionary tree; a
        # separate ``!cd`` line does not carry over to the next ``!`` command.
        subprocess.run(["make", "install"], cwd=self.mecab_dic_dir, check=True)

        # The tokenizer built in __init__ predates the rebuilt dictionary.
        self.tokenizer = Mecab()

        # Text normalisation.
        if word_rep:
            target_col = f'정제_{self.text_col}'
            self.df[target_col] = self.df[self.text_col].apply(self._replace_words)
        else:
            target_col = self.text_col

        # Morphological analysis.
        return self._morph(self.df, self.tokenizer, target_col)

if __name__ == "__main__":

# Mecab 설치

!apt-get update

!apt-get install g++ openjdk-8-jdk

!pip3 install konlpy JPype1-py3

!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

# mecab-python의 버전 오류로 인해 아래 패키지를 설치하면 코랩에서 Mecab을 사용가능

!pip install mecab-python3

# 패키지 import

from konlpy.tag import Mecab

import pandas as pd

from tqdm import tqdm

try:

from jamo import h2j, j2hcj

except:

! pip install jamo

from jamo import h2j, j2hcj

files = Corab_morph(

df_path='경로1',\

ner_dic_path='경로2',\

user_dic_path='경로3',\

text_col = '평가소견내용' ,\

encoding='cp949')

df = files.run(word_rep=False)

'데이터분석' 카테고리의 다른 글

공간분석 프로젝트_함수정리 (0)	2023.12.04

H_record

사용자 사전 추가된 Mecab 형태소 분석 in Corab

'데이터분석' 카테고리의 다른 글

티스토리툴바

사용자 사전 추가된 Mecab 형태소 분석 in Corab

'데이터분석' 카테고리의 다른 글

'데이터분석' Related Articles

티스토리툴바