
Python

Trying out a QnA test: random values, classification

최무회 2020. 8. 27. 19:00

 

import re
from collections import Counter  # used for the token-frequency intersection in comMonText_biyur
from nltk import sent_tokenize   # used in token_sent

# NOTE: tokenize_kiwi below expects a kiwipiepy Kiwi instance named `kiwi`
# to exist at module level (it is created further down in this post).
class Cleaning_Text:
    @staticmethod
    def comMonText_biyur(original_tokenList, gensim_tokenList1):
        """Return the overlap between two token lists as a percentage string."""
        _topFreqOriginal = Counter(original_tokenList)
        _topFreqGensim1 = Counter(gensim_tokenList1)
        ### intersection of the two frequency counters
        comMon1_1 = _topFreqOriginal & _topFreqGensim1
        # keep only the most frequent entries of the intersection
        most_cnt = 50  ## top 50
        comMon1_1 = comMon1_1.most_common(most_cnt)  # _topFreqGensim
        if len(original_tokenList) == 0:
            biyur1_1 = 0
        else:
            biyur1_1 = round(len(comMon1_1) / len(original_tokenList), 2) * 100
            biyur1_1 = int(biyur1_1)
        biyur1_1 = str(biyur1_1) + '%'
        return biyur1_1

    @staticmethod
    def tokenize_kiwi(sent):
        res, score = kiwi.analyze(sent)[0]  # use the first (best-scoring) analysis
        return [word
                for word, tag, _, _ in res
                if not tag.startswith('E')
                and not tag.startswith('J')
                and not tag.startswith('S')]  # drop endings (E), particles (J) and symbols (S)

    @staticmethod
    def token_word(documents):  ## takes a list of documents
        texts = [
            [word for word in document.split() if len(word) > 1]
            for document in documents
        ]
        return texts

    @staticmethod
    def token_sent(text):
        # split the text into sentences with nltk's sent_tokenize
        return sent_tokenize(text)

    @staticmethod
    def listToText(inputList):
        # join the list into a single string, then run it through text_cleaning
        returnText = ''.join(inputList)
        rt2 = Cleaning_Text.text_cleaning(returnText)
        return rt2

    ##  # step_4, shared code: text cleaning
    @staticmethod
    def text_cleaning(text):
        ##################################  regular expressions for gensim preprocessing (200624)
        hangul_path9 = r'[가-힣]+\.'  # Hangul followed by a period

        hangul_path0 = r'[가-힣]+\.[가-힣]{1}'  # Hangul + '.' + one Hangul character
        hangul_path1 = r'[가-힣]+\.[\d]{1}'  # Hangul + '.' + one digit [0-9]
        hangul_path2 = r'[가-힣]+\.[a-z]{1}'  # Hangul + '.' + one lowercase letter
        hangul_path3 = r'[가-힣]+\.[A-Z]{1}'  # Hangul + '.' + one uppercase letter
        hangul_path4 = r'[가-힣]+\.[\S]{1}'  # Hangul + '.' + one non-whitespace character ([^ \t\n\r\f\v])
        hangul_path5 = r'[가-힣]+\.[\s]{1}'  # Hangul + '.' + one whitespace character ([ \t\n\r\f\v])
        hangul_path6 = r'[가-힣]+\.[\W]{1}'  # Hangul + '.' + one non-word character ([^a-zA-Z0-9_])
        hangul_path7 = r'[가-힣]+\.[\w]{1}'  # Hangul + '.' + one word character ([a-zA-Z0-9_])
        hangul_path8 = r'[가-힣]+\.[\b]{1}'  # Hangul + '.' + backspace (inside [] \b is a backspace, not a word boundary; unused)

        reg_path = hangul_path0 + '|' + hangul_path1 + '|' + hangul_path2 + '|' + hangul_path3 + '|' + hangul_path4 + '|' + hangul_path5
        hangul = re.compile(reg_path)  # Hangul + '.' followed by one of the character classes above

        result = hangul.findall(text)  # collect every match of the pattern as a list
        result = list(set(result))
        for x in result:
            text = text.replace(x, x[:-1] + '\n' + x[-1:])

        ### strip leading/trailing whitespace on every line
        text = text.replace('\n', '_').split('_')
        text = [x.strip() for x in text]
        tts = ''
        for i in text:
            tts = tts + i + '\n'
        text = tts
        ##################################  regular expressions for gensim preprocessing (200624)
        text = re.sub(r'\[.+?\]', '', text)  # remove everything inside square brackets [], brackets included

        # remove emoji
        EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
        text = EMOJI.sub(r'', text)
        # remove e-mail addresses
        email = re.compile(r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)')
        text = email.sub('', text)
        # remove URLs
        url = re.compile(r'(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
        text = url.sub('', text)
        # remove HTML tags
        html = re.compile('<[^>]*>')
        text = html.sub('', text)

        # replace special characters with spaces ('.' is kept so sentence boundaries survive)
        # special = re.compile('[^\w\s]')
        # text = special.sub(' ', text)
        special = ['*', '{', ',', ':', ']', '$', '+', '[', '#', '(', '%', '&', '}', '`', '‘', '’', '·',
                   '=', ';', '>', '>', '/', '"', '“', '”', '\\', '?', '~', "'", '<', ')', '^', '!', '_',
                   '|', '@', '@', '©', 'ⓒ', '℗', '®', '①', '-', '▶', '…', '☞', '▲', '◆', '■']  # everything except '.'
        for ch in special:
            text = text.replace(ch, ' ')

        # collapse the repeated spaces created by the replacements above
        while text.find('  ') != -1:
            text = text.replace('  ', ' ')

        # collapse repeated newlines as well
        while text.find('\n\n') != -1:
            text = text.replace('\n\n', '\n')

        # strip leading and trailing whitespace
        text = text.strip()
        return text

    # Text-cleaning function: removes everything that is not needed for the analysis.
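
# A minimal usage sketch for Cleaning_Text.text_cleaning, with a made-up sample
# string: brackets, URLs, HTML tags and special characters should be stripped.
sample = '테스트용 문장입니다. [기자명] https://example.com <b>강조</b> 끝!'
print(Cleaning_Text.text_cleaning(sample))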


# POS-tag the text with Hannanum and keep only predicates (P) and nouns (N)
def hannanum_get_infoText(read_text):
    # resList = list()
    resList = []
    # GetWordSet = set(['N'])
    GetWordSet = set(['N', 'P'])
    for read_text_line in read_text:
        res = ""
        if len(read_text_line) > 0:
            pos = hannanum.pos(read_text_line, ntags=9)
            for keyword, tag in pos:
                # keep only keywords longer than one character
                if len(keyword) > 1:
                    # keep only predicates (P) and nouns (N)
                    if (tag in GetWordSet):
                        if tag == 'P':  # for predicates, append '다' to restore the dictionary form
                            keyword = keyword + '다'
                        resList.append(keyword)
    return resList    
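
# A minimal usage sketch for hannanum_get_infoText, assuming konlpy's Hannanum is
# installed; the sample sentence below is made up purely for illustration.
from konlpy.tag import Hannanum
hannanum = Hannanum()   # hannanum_get_infoText reads this module-level name
print(hannanum_get_infoText(['형태소 분석 결과에서 체언과 용언만 남겨 봅니다.']))
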
from Cleaning_Tx import Cleaning_Text
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
from gensim import models
from collections import defaultdict
from gensim import corpora
from gensim import similarities
from gensim.summarization import summarize  # requires gensim < 4.0 (summarization was removed in 4.x)

import pandas as pd
import time, timeit, os, sys, re, math

from nltk import sent_tokenize, word_tokenize
from konlpy.tag import Hannanum
from kiwipiepy import Kiwi
kiwi = Kiwi()
kiwi.prepare()  # required by the older kiwipiepy API used here
hannanum = Hannanum()
from rank_bm25 import BM25Okapi
from datetime import datetime

# Use BM25 to pull out the sentences that contain the words measured by cosine similarity.
start_now = int(time.time())
td = datetime.today().strftime("%Y%m%d")
tdd = datetime.today().strftime("%m%d")
now = datetime.now()
tdnow = now.strftime('%Y%m%d%H%M%S')
## Extract similar text sentences with BM25, using the words obtained from the cosine-similarity step.
### from gensimBM25Source
import math
from six import iteritems
from six.moves import xrange
# BM25 parameters.
PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25

class BM25(object):
    def __init__(self, corpus):
        self.corpus_size = len(corpus)
        self.avgdl = sum(map(lambda x: float(len(x)), corpus)) / self.corpus_size
        self.corpus = corpus
        self.f = []
        self.df = {}
        self.idf = {}
        self.initialize()

    def initialize(self):
        for document in self.corpus:
            frequencies = {}
            for word in document:
                if word not in frequencies:
                    frequencies[word] = 0
                frequencies[word] += 1
            self.f.append(frequencies)

            for word, freq in iteritems(frequencies):
                if word not in self.df:
                    self.df[word] = 0
                self.df[word] += 1

        for word, freq in iteritems(self.df):
            self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

    def get_score(self, document, index, average_idf):
        score = 0
        for word in document:
            if word not in self.f[index]:
                continue
            idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
            score += (idf * self.f[index][word] * (PARAM_K1 + 1)
                      / (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl)))
        return score

    def get_scores(self, document, average_idf):
        scores = []
        for index in xrange(self.corpus_size):
            score = self.get_score(document, index, average_idf)
            scores.append(score)
        return scores

###  Compute the BM25 weights: each document scored against every document.
def get_bm25_weights(corpus):
    bm25 = BM25(corpus)
    average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys())

    weights = []
    for doc in corpus:
        scores = bm25.get_scores(doc, average_idf)
        weights.append(scores)
    return weights
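
# A minimal sketch of how get_bm25_weights can be used; the toy corpus below is
# made up purely for illustration (it is not the data used later in this post).
toy_corpus = [
    ['날씨', '가', '좋다'],
    ['오늘', '날씨', '는', '맑다'],
    ['블로그', '글', '을', '쓴다'],
]
toy_weights = get_bm25_weights(toy_corpus)  # one list of BM25 scores per document
print(toy_weights[0])                       # scores of document 0 against every document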


print('success')
# %matplotlib inline
import pandas as pd
import matplotlib as mpl
import numpy as np
import re


# import the required packages and libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# workaround for minus signs rendering incorrectly in plots
mpl.rcParams['axes.unicode_minus'] = False
print('success01')
import clusterLib as ss

###########################################################################
## re-bind the names exposed by the clusterLib (ss) module
plt = ss.plt
pd = ss.pd
Cleaning_Text = ss.Cleaning_Text
word_tokenize = ss.word_tokenize

###########################################################################

dfs = pd.read_excel('./testdd001.xlsx')

df = dfs.head(12000).copy()
df_ = dfs[12000:].copy()

print("There are %d null values in the subMenu column." % df['subMenu'].isnull().sum())
print("There are %d null values in the content column." % df['content'].isnull().sum())
df[df.isnull().values==True]  # inspect the rows that contain nulls
df = df[pd.notnull(df['content'])]
# print(df.head())
# Next, count how many rows each category (subMenu) has.
d = {'subMenu':df['subMenu'].value_counts().index, 'count': df['subMenu'].value_counts()}
df_cat = pd.DataFrame(data=d).reset_index(drop=True)
# print(df_cat)

# Next, convert each subMenu category into a numeric id; integer labels are easier to feed to the classification models later.
df['subMenu_id'] = df['subMenu'].factorize()[0]
cat_id_df = df[['subMenu', 'subMenu_id']].drop_duplicates().sort_values('subMenu_id').reset_index(drop=True)
cat_to_id = dict(cat_id_df.values)
id_to_cat = dict(cat_id_df[['subMenu_id', 'subMenu']].values)

cat_id_df.values
id_to_cat

print(cat_id_df)
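
# A small sketch of what factorize() produces, using made-up category names
# (the real subMenu values come from the spreadsheet loaded above):
toy_ids, toy_cats = pd.factorize(pd.Series(['이벤트', '오류신고', '이벤트']))
print(toy_ids)   # [0 1 0] -- one integer id per row
print(toy_cats)  # the distinct categories, in the order their ids were assigned
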
# df['clean_content'] = df['content'].apply(Cleaning_Text.text_cleaning)
# df = df[['subMenu_id','subMenu','content','clean_content']]
print(df.sample(5))

# Tokenize: split each document's content into words.
df['cut_content'] = df['content'].apply(lambda x: " ".join(w for w in word_tokenize(x)))
print(df.head())

# Here we pass ngram_range=(1, 2):
# besides extracting every single word from the text,
# we also extract every pair of adjacent words as a "word pair" feature, e.g. word1, word2, word3, word4, (word1, word2), (word2, word3), (word3, word4).
# This enlarges the feature set, and a richer feature set gives the classifier a better chance of being accurate.
# norm='l2' normalizes each document's tf-idf vector to unit (L2) length.
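
# A quick sketch (with a made-up sentence) of what ngram_range=(1, 2) yields:
# the vocabulary holds both the single words and the adjacent word pairs.
from sklearn.feature_extraction.text import TfidfVectorizer
toy_vec = TfidfVectorizer(norm='l2', ngram_range=(1, 2))
toy_vec.fit(['오늘 날씨 좋다'])
print(sorted(toy_vec.vocabulary_))  # ['날씨', '날씨 좋다', '오늘', '오늘 날씨', '좋다']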

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(norm='l2', ngram_range=(1, 2))
cut_content = df.cut_content.tolist()
# print(cut_content[0], end=',')

features = tfidf.fit_transform(cut_content)
labels = df.subMenu_id
print(features.shape)
print('-(number of documents, number of tf-idf features)----------------------------')
print(features)

# Use the chi-squared test to find, for each category, the two most correlated words and the two most correlated word pairs.
# The chi-squared test is a statistical tool for checking goodness of fit and association.
# Here we use sklearn's chi2 function.
# After the chi2 test, the most strongly correlated words and word pairs of each category emerge; they reflect each category's topic quite well.
from sklearn.feature_selection import chi2
import numpy as np

cat_to_id.items()

N = 2
for cat, cat_id in sorted(cat_to_id.items()):
    features_chi2 = chi2(features, labels == cat_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    print(feature_names)
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(cat))
    print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))  # the last N (= 2) entries, i.e. the highest chi2 scores
    print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

X_train, X_test, y_train, y_test = train_test_split(df['cut_content'], df['subMenu_id'], random_state=0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB().fit(X_train_tfidf, y_train)


## After training, use the model to predict the category of a piece of content.
## First define a prediction helper function.

# df['cut_content'] = df['clean_content'].apply(lambda x: " ".join(w for w in word_tokenize(x)))

def myPredict(text):
    format_sec=" ".join(w for w in word_tokenize(text))
    print(format_sec)
    pred_cat_id=clf.predict(count_vect.transform([format_sec]))
    predic = id_to_cat[pred_cat_id[0]]
    print('Predicted category : {}'.format(predic))

myPredict("블로그에 글 올리면 뭐 주시는 거예요?")

# for i in range(3):
#     myPredict(input("QnA를 입력해보세요 : "))



#############################
## Next, try several different machine-learning models and compare their accuracy; we use the following four models:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_score

models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
# #############################
import seaborn as sns
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()
# #############################



## Of the models above, LinearSVC shows the highest accuracy.
fenshu = cv_df.groupby('model_name').accuracy.mean()

print("Mean accuracy per model: ", fenshu.sort_values(ascending=False))
print(type(fenshu))

############################


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# Train a model (LogisticRegression here; a LinearSVC variant is sketched further below)
model = LogisticRegression()
X_train, X_test, y_train, y_test\
    , indices_train\
    , indices_test = train_test_split(features
                                      , labels
                                      , df.index
                                      , test_size=0.33
                                      , stratify=labels
                                      , random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# # # build the confusion matrix
# # conf_mat = confusion_matrix(y_test, y_pred)
# # fig, ax = plt.subplots(figsize=(10, 8))
# # sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels=cat_id_df.subMenu.values, yticklabels=cat_id_df.subMenu.values)
# # plt.ylabel('shijiJIEGUO', fontsize=18)
# # plt.xlabel('yuceJIEGUO', fontsize=18)
# # plt.show()

# from sklearn.metrics import classification_report
# print('accuracy %s' % accuracy_score(y_pred, y_test))
# print(classification_report(y_test, y_pred, target_names=cat_id_df['subMenu'].values))
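
# Since LinearSVC had the best cross-validation score above, here is a minimal
# sketch of fitting it on the same tf-idf split, as an alternative to the
# LogisticRegression model trained above:
svc_model = LinearSVC()
svc_model.fit(X_train, y_train)
svc_pred = svc_model.predict(X_test)
print('LinearSVC accuracy: %s' % accuracy_score(y_test, svc_pred))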

# df['clean_content'] = df['content'].apply(Cleaning_Text.text_cleaning)

# df_ = df_['content'].tolist()

test = df_.content.tolist()

print(type(test))
print(test)

for x in test:
    myPredict(x)


myPredict(input("Try a QnA prediction: "))