
QnA module classification and testing

최무회 2020. 8. 28. 17:07

001.

from libs import *

df['cut_content'] = df['content'].apply(lambda x: " ".join(w for w in word_tokenize(str(x))))
# Here ngram_range=(1, 2) means that, besides every single word in the text,
# we also extract each pair of adjacent words as a "word pair",
# e.g. w1, w2, w3, w4, (w1, w2), (w2, w3), (w3, w4).
# This enlarges the feature set, and a richer feature set makes it easier to classify the text accurately.
# norm='l2' normalizes each document vector to unit Euclidean length.
tfidf = TfidfVectorizer(norm='l2', ngram_range=(1, 2))
cut_content = df.cut_content.tolist()
features = tfidf.fit_transform(cut_content)
labels = df.subMenu_id
print('labels')

# Use the chi-square test to find the unigrams and bigrams most strongly associated with each class.
# The chi-square test is a statistical tool for measuring goodness of fit and association;
# here we use sklearn's chi2 function.
# After the chi2 test we can list, for each class, the two most correlated unigrams and bigrams;
# these terms reflect the topic of each class quite well.
N = 2
for cat, cat_id in sorted(cat_to_id.items()):
    features_chi2 = chi2(features, labels == cat_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(cat))
    print("  . most correlated unigrams: {}".format(', '.join(unigrams[-N:])))
    print("  . most correlated bigrams : {}".format(', '.join(bigrams[-N:])))

X_train, X_test, y_train, y_test = train_test_split(df['cut_content'], df['subMenu_id'], random_state=0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB().fit(X_train_tfidf, y_train)
## After training, use the classifier to predict the category of new content.
## To do that, first define a prediction function.
def yuce(text):
    format_sec=" ".join(w for w in word_tokenize(text))
    pred_cat_id=clf.predict(count_vect.transform([format_sec]))
    predic = id_to_cat[pred_cat_id[0]]
    print('Prediction: {}'.format(predic))
    QnA = predic
    text = format_sec
    output = {QnA:text}
    return output


models = [
    LogisticRegression(random_state=0, max_iter= mx_cnt),
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),  # tends to work well on relatively small datasets
    MultinomialNB()
]

CV = 5
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])


# # #############################
# sns.boxplot(x='model_name', y='accuracy', data=cv_df)
# sns.stripplot(x='model_name', y='accuracy', data=cv_df,
#               size=8, jitter=True, edgecolor="gray", linewidth=2)
# plt.show()
# # #############################
# ## Among the models above, pick the one whose mean accuracy is highest.
# fenshu = cv_df.groupby('model_name').accuracy.mean()
#
# print("종합평균점수: ", fenshu.sort_values(ascending=False))
# print(type(fenshu))

############################

# Train the final model
model = LogisticRegression()

X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    features, labels, df.index, test_size=0.33, stratify=labels, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# save_model(model)
# model_name = 'test001.model'
# mod = read_model(model_name)

for i in range(3):
    yuce(input("Try a QnA prediction: "))
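To make the ngram_range=(1, 2) setting described at the top of this section concrete, here is a minimal, self-contained sketch on a made-up two-sentence corpus (illustration only, not the blog's data); the fitted vocabulary contains both the single words and the adjacent word pairs.

from sklearn.feature_extraction.text import TfidfVectorizer

# Two toy "documents", already whitespace-tokenized like cut_content above.
toy_docs = ["배송 문의 드립니다", "환불 문의 드립니다"]

toy_tfidf = TfidfVectorizer(norm='l2', ngram_range=(1, 2))
toy_features = toy_tfidf.fit_transform(toy_docs)

# The vocabulary holds unigrams and bigrams, e.g. '문의', '배송', '배송 문의', '문의 드립니다', ...
print(toy_tfidf.get_feature_names())
# Because of norm='l2', the squared tf-idf weights of each row sum to 1.
print(toy_features.shape)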

 

002.

# Read every Excel file in the folder and merge them (first sheet of each).
import os
import pandas as pd
dir = 'data/'
filenames = os.listdir(dir)
index = 1
dfs = []
for name in filenames:
    # print(index)
    dfs.append(pd.read_excel(os.path.join(dir, name)))
    index += 1  # keep track of how many files have been merged so far
df = pd.concat(dfs)
df.to_excel('./total.xlsx', index=False)

print('./total.xlsx')


# from okp import *
#
# dfs = pd.read_excel('./testdd001.xlsx')
# df = dfs.head(12000).copy()
# df_ = dfs[12000:].copy()
#
# df[df.isnull().values==True]
# df = df[pd.notnull(df['content'])]
# #  Next, count how many samples each class has.
# d = {'subMenu':df['subMenu'].value_counts().index, 'count': df['subMenu'].value_counts()}
# df_cat = pd.DataFrame(data=d).reset_index(drop=True)
#
# # Convert the subMenu categories to numeric ids, which makes training the classifier easier later on.
# df['subMenu_id'] = df['subMenu'].factorize()[0]
# cat_id_df = df[['subMenu', 'subMenu_id']].drop_duplicates().sort_values('subMenu_id').reset_index(drop=True)
# cat_to_id = dict(cat_id_df.values)
# id_to_cat = dict(cat_id_df[['subMenu_id', 'subMenu']].values)
#
# # Tokenize the content into words.
# df['cut_content'] = df['content'].apply(lambda x: " ".join(w for w in word_tokenize(x)))

 

003.

# Load the merged total.xlsx created in 002.
import os
import pandas as pd
df = pd.read_excel('./total.xlsx')

dfs = df.copy()
cnt = int(len(dfs)*0.8)
df_ = dfs[cnt:].copy()
df  = dfs.head(cnt).copy()
df[df.isnull().values==True]  # inspect rows that contain NaN values
df = df[pd.notnull(df['content'])]
#  Next, count how many samples each class has.
d = {'subMenu':df['subMenu'].value_counts().index, 'count': df['subMenu'].value_counts()}
df_cat = pd.DataFrame(data=d).reset_index(drop=True)

# Convert the subMenu categories to numeric ids, which makes training the classifier easier later on.
df['subMenu_id'] = df['subMenu'].factorize()[0]
cat_id_df = df[['subMenu', 'subMenu_id']].drop_duplicates().sort_values('subMenu_id').reset_index(drop=True)
cat_to_id = dict(cat_id_df.values)
id_to_cat = dict(cat_id_df[['subMenu_id', 'subMenu']].values)
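For reference, a tiny illustration (with made-up category names, not the real subMenu values) of what factorize() and the two lookup dictionaries built above contain:

import pandas as pd

toy = pd.DataFrame({'subMenu': ['배송', '환불', '배송', '회원']})
toy['subMenu_id'] = toy['subMenu'].factorize()[0]        # [0, 1, 0, 2]
toy_ids = toy[['subMenu', 'subMenu_id']].drop_duplicates()
print(dict(toy_ids.values))                              # {'배송': 0, '환불': 1, '회원': 2}
print(dict(toy_ids[['subMenu_id', 'subMenu']].values))   # {0: '배송', 1: '환불', 2: '회원'}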


 

004.

## 1. Using pickle to save / load the model
import pickle
model_name = 'test001.model'

# Saving the model.
# `model` must already be a trained estimator; open the file with "wb+", otherwise writing fails.
def save_model(model):
    s = pickle.dumps(model)
    with open(model_name, "wb+") as f:
        f.write(s)
    print("Done\n")

def read_model(model_name):
    # Loading the model back; note the read mode 'rb', otherwise loading fails.
    with open(model_name, 'rb') as f2:
        s2 = f2.read()
    model1 = pickle.loads(s2)
    return model1

print('success')
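A minimal round-trip check for the two helpers above; the tiny MultinomialNB fit is only there to have something to pickle and is not part of the real pipeline.

import numpy as np
from sklearn.naive_bayes import MultinomialNB

demo_model = MultinomialNB().fit(np.array([[1, 0], [0, 1]]), np.array([0, 1]))

save_model(demo_model)               # writes 'test001.model' next to the script
restored = read_model(model_name)    # loads the same estimator back
print(restored.predict(np.array([[1, 0]])))  # -> [0]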

 

005.

import re
from collections import Counter
from nltk import sent_tokenize
class Cleaning_Text:
    def comMonText_biyur(original_tokenList, gensim_tokenList1):
        _topFreqOriginal = Counter(original_tokenList)
        _topFreqGensim1 = Counter(gensim_tokenList1)
        ### intersection of the two token counters
        comMon1_1 = _topFreqOriginal & _topFreqGensim1
        # keep only the most frequent shared tokens
        most_cnt = 50  ## top 50
        comMon1_1 = comMon1_1.most_common(most_cnt)  # _topFreqGensim
        if len(original_tokenList) == 0:
            biyur1_1 = 0
        else:
            biyur1_1 = round(len(comMon1_1) / len(original_tokenList), 2) * 100
            biyur1_1 = int(biyur1_1)
        biyur1_1 = str(biyur1_1) + '%'
        return biyur1_1

    def tokenize_kiwi(sent):
        # `kiwi` is the analyzer object created in 007. libs
        res, score = kiwi.analyze(sent)[0]  # use the first analysis result
        return [word
                for word, tag, _, _ in res
                if not tag.startswith('E')
                and not tag.startswith('J')
                and not tag.startswith('S')]  # drop endings, particles and symbols

    def token_word(documents):  ## param List
        texts = [
            [word for word in document.split() if len(word) > 1]
            for document in documents
        ]
        return texts

    def token_sent(text):
        return sent_tokenize(text)

    def listToText(inputList):
        returnText = ''
        for i in inputList:
            returnText = returnText + i
        rt2 = Cleaning_Text.text_cleaning(returnText)
        return rt2

    ##  # step_4, shared code: text cleaning
    def text_cleaning(text):
        ##################################  regular expressions for gensim, 200624
        hangul_path9 = '[가-힣]+\.'  # Hangul followed by a period

        hangul_path0 = '[가-힣]+\.[가-힣]{1}'  # Hangul + '.' + Hangul
        hangul_path1 = '[가-힣]+\.[\d]{1}'  # Hangul + '.' + digit [0-9]
        hangul_path2 = '[가-힣]+\.[a-z]{1}'  # Hangul + '.' + lowercase letter
        hangul_path3 = '[가-힣]+\.[A-Z]{1}'  # Hangul + '.' + uppercase letter
        hangul_path4 = '[가-힣]+\.[\S]{1}'  # Hangul + '.' + non-whitespace, i.e. [^ \t\n\r\f\v]
        hangul_path5 = '[가-힣]+\.[\s]{1}'  # Hangul + '.' + whitespace, i.e. [ \t\n\r\f\v]
        hangul_path6 = '[가-힣]+\.[\W]{1}'  # Hangul + '.' + non-word character, i.e. [^a-zA-Z0-9_]
        hangul_path7 = '[가-힣]+\.[\w]{1}'  # Hangul + '.' + word character, i.e. [a-zA-Z0-9_]
        hangul_path8 = '[가-힣]+\.[\b]{1}'  # Hangul + '.' + word boundary (between \w and \W)

        reg_path = hangul_path0 + '|' + hangul_path1 + '|' + hangul_path2 + '|' + hangul_path3 + '|' + hangul_path4 + '|' + hangul_path5
        hangul = re.compile(reg_path)  # Hangul sentence boundaries

        result = hangul.findall(text)  # every match of the pattern, as a list of substrings
        result = list(set(result))
        for x in result:
            text = text.replace(x, x[:-1] + '\n' + x[-1:])

        ### strip leading/trailing whitespace on every line
        text = text.replace('\n', '_').split('_')
        text = [x.strip() for x in text]
        tts = ''
        for i in text:
            tts = tts + i + '\n'
        text = tts
        ##################################  regular expressions for gensim, 200624
        text = re.sub('\[.+?\]', '', text)  # delete everything inside square brackets, brackets included

        # remove emoji
        EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
        text = EMOJI.sub(r'', text)
        # remove e-mail addresses
        email = re.compile('([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)')
        text = email.sub('', text)
        # remove URLs
        url = re.compile('(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
        text = url.sub('', text)
        # remove HTML tags
        html = re.compile('<[^>]*>')
        text = html.sub('', text)

        # replace special characters with a space (the period is kept so sentences survive)
        # special =re.compile('[^\w\s]')
        # text = special.sub(' ', text)
        special = ['*', '{', ',', ':', ']', '$', '+', '[', '#', '(', '%', '&', '}', '`', '‘', '’', '·',
                   '=', ';', '>', '>', '/', '"', '“', '”', '\\', '?', '~', "'", '<', ')', '^', '!', '_',
                   '|', '@', '@', '©', 'ⓒ', '℗', '®', '①', '-', '▶', '…', '☞', '▲', '◆', '■']  # everything except '.'
        for ch in special:
            text = text.replace(ch, ' ')

        # collapse the duplicated spaces left behind by the replacements above
        while '  ' in text:
            text = text.replace('  ', ' ')

        # collapse duplicated newlines as well
        while '\n\n' in text:
            text = text.replace('\n\n', '\n')

        # strip leading/trailing whitespace
        text = text.strip()
        return text

    # Text-cleaning helper: removes every character that is not needed for the analysis.


# Morphological analysis: keep only predicates (P) and substantives (N)
def hannanum_get_infoText(read_text):
    # resList = list()
    resList = []
    # GetWordSet = set(['N'])
    GetWordSet = set(['N', 'P'])
    for read_text_line in read_text:
        if len(read_text_line) > 0:
            # `hannanum` is the analyzer object created in 007. libs
            pos = hannanum.pos(read_text_line, ntags=9)
            for keyword, type in pos:
                # keep only keywords longer than one character
                if len(keyword) > 1:
                    # keep only predicates (P) and substantives (N)
                    if (type in GetWordSet):
                        if type == 'P':  # append '다' to predicates to restore their dictionary form
                            keyword = keyword + '다'
                        resList.append(keyword)
    return resList
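A quick sanity check of Cleaning_Text.text_cleaning on a made-up complaint-style string (illustration only, not the blog's data):

sample = "배송 문의드립니다.제품이 [긴급] 파손되었습니다!! 참고: http://example.com 문의: help@example.com"
print(Cleaning_Text.text_cleaning(sample))
# Expected output (roughly):
#   배송 문의드립니다.
#   제품이 파손되었습니다 참고 문의
# The bracketed tag, the URL, the e-mail address and the special characters are removed,
# and a line break is inserted after the Hangul sentence ending in '다.'.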

 

006.

mx_cnt = 200  # max_iter for the LogisticRegression model in 001

007. libs

from Cleaning_Tx import Cleaning_Text
# from a_readFolderExcel import *
from b_readExcel import *
from c_readModel import *
from d_parameters import *

## 1. pickle, used for saving and loading the trained model
import pickle


from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.naive_bayes import MultinomialNB
# #############################
## Next we try several machine-learning models and compare their accuracy; the following four models are used:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_score
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
from gensim import models
from collections import defaultdict
from gensim import corpora
from gensim import similarities
from gensim.summarization import summarize


import pandas as pd
# %matplotlib inline
import matplotlib as mpl
import numpy as np
import re

import time, timeit, os, sys , re , math

from nltk import sent_tokenize, word_tokenize
from konlpy.tag import Hannanum
from kiwipiepy import Kiwi
kiwi = Kiwi()
kiwi.prepare()
hannanum = Hannanum()
from rank_bm25 import BM25Okapi
from datetime import datetime

# Use BM25 to pull out the sentences that contain the words scored by cosine similarity.
start_now = int(time.time())
td = datetime.today().strftime("%Y%m%d")
tdd = datetime.today().strftime("%m%d")
now = datetime.now()
tdnow = now.strftime('%Y%m%d%H%M%S')
## Extract similar sentences with BM25, using the words obtained from the cosine-similarity step.
### adapted from the gensim BM25 source
import math
from six import iteritems
from six.moves import xrange
# BM25 parameters.
PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25

# import the packages and libraries needed for plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# work around the broken minus sign in matplotlib plots
mpl.rcParams['axes.unicode_minus'] = False
print('libs-success01')





008. okp

from libs import *

# Use BM25 to pull out the sentences that contain the words scored by cosine similarity.
start_now = int(time.time())
td = datetime.today().strftime("%Y%m%d")
tdd = datetime.today().strftime("%m%d")
now = datetime.now()
tdnow = now.strftime('%Y%m%d%H%M%S')
## Extract similar sentences with BM25, using the words obtained from the cosine-similarity step.
### adapted from the gensim BM25 source
import math
from six import iteritems
from six.moves import xrange
# BM25 parameters.
PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25

class BM25(object):
    def __init__(self, corpus):
        self.corpus_size = len(corpus)
        self.avgdl = sum(map(lambda x: float(len(x)), corpus)) / self.corpus_size
        self.corpus = corpus
        self.f = []
        self.df = {}
        self.idf = {}
        self.initialize()

    def initialize(self):
        for document in self.corpus:
            frequencies = {}
            for word in document:
                if word not in frequencies:
                    frequencies[word] = 0
                frequencies[word] += 1
            self.f.append(frequencies)

            for word, freq in iteritems(frequencies):
                if word not in self.df:
                    self.df[word] = 0
                self.df[word] += 1

        for word, freq in iteritems(self.df):
            self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

    def get_score(self, document, index, average_idf):
        score = 0
        for word in document:
            if word not in self.f[index]:
                continue
            idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
            score += (idf * self.f[index][word] * (PARAM_K1 + 1)
                      / (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl)))
        return score

    def get_scores(self, document, average_idf):
        scores = []
        for index in xrange(self.corpus_size):
            score = self.get_score(document, index, average_idf)
            scores.append(score)
        return scores

###  compute the BM25 weight of every document against every other document
def get_bm25_weights(corpus):
    bm25 = BM25(corpus)
    average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys())

    weights = []
    for doc in corpus:
        scores = bm25.get_scores(doc, average_idf)
        weights.append(scores)
    return weights

print('okp_success')
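A minimal usage sketch for get_bm25_weights on a toy tokenized corpus (made-up tokens, not the blog's data); each row of the result holds the BM25 scores of one document's words against every document in the corpus.

toy_corpus = [
    ["배송", "지연", "문의"],
    ["환불", "요청", "문의"],
    ["회원", "정보", "변경"],
]
toy_weights = get_bm25_weights(toy_corpus)
# toy_weights[i][j] is the BM25 score of document j for the words of document i;
# a document usually scores highest against itself.
print(toy_weights[0])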



# # # build the confusion matrix
# # conf_mat = confusion_matrix(y_test, y_pred)
# # fig, ax = plt.subplots(figsize=(10, 8))
# # sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels=cat_id_df.subMenu.values, yticklabels=cat_id_df.subMenu.values)
# # plt.ylabel('Actual', fontsize=18)
# # plt.xlabel('Predicted', fontsize=18)
# # plt.show()

# from sklearn.metrics import classification_report
# print('accuracy %s' % accuracy_score(y_pred, y_test))
# print(classification_report(y_test, y_pred, target_names=cat_id_df['subMenu'].values))

Attachment: total.xlsx (2.08 MB)
