
python: 200805-keras001.py, test in a PyCharm module

최무회 2020. 8. 5. 20:13

 

"""
使用 PyTorch-Transformers 模型库,先设置好准备输入模型的例子,
使用 BertTokenizer() 建立分词器对象对原句进行分词,然后对照词表将词转换成序号。
"""


# from nltk import sent_tokenize, word_tokenize

from Cleaning_Text import Cleaning_Text
import numpy as np

import os, sys
import pandas as pd

print('success')

## Read in the data and convert it
# df = pd.read_excel('./test_data_pytorch/test_allData_5000-2.xlsx')
read_path='./../../04-srcTest/test_data/test_allData_5000.xlsx'
print(read_path)

df=pd.read_excel(read_path)
df = df[['subMenu', 'content']].sample(1000)
df2 = df.copy()
type(df2.subMenu.values)
label_name = set(df2.subMenu.values)
label_name = sorted(list(label_name))
print(label_name)

# Map each subMenu category name to its index in label_name (kept as a string).
labels = []
subMenu = df.subMenu.tolist()
for menu in subMenu:
    for k, name in enumerate(label_name):
        if menu == name:
            labels.append(str(k))

print(labels)
print(len(labels))
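
# Equivalent one-step mapping with a dict comprehension (shown only as a
# cross-check of the loop above; label_name is already sorted).
label_to_idx = {name: str(i) for i, name in enumerate(label_name)}
assert labels == [label_to_idx[s] for s in subMenu]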

## Apply the labels
df2['labels'] = labels
df2 = df2[['labels', 'subMenu', 'content']]
print(df2)

type(df2.content.values)
datas = df2.content.values
len(datas)

data = []
for i in datas:
    ts = i
    ts = Cleaning_Text.text_cleaning(ts)
    ts = ts.split('\n')
    data.append(ts)

#  Append [SEP] to each content line (the actual append is commented out below)
dt = []
for i, j in enumerate(data):
    dts = []
    for k in j:
        ts = k
        #         ts =  ts + ' [SEP] '
        dts.append(ts)
    dt.append(dts)

print('dt',dt)

#  Prepend [CLS] to the content data (commented out here; [CLS]/[SEP] are added later when building sentencses)
dtt = dt
# dtt = ['[CLS]' + Cleaning_Text.listToText(dt[x]) for x in range(len(dt))]
dtt[2]
df2['cts'] = dtt
df2 = df2[['labels', 'subMenu', 'cts']]
df3 = df2.copy()
# df3_cts = [df2.cts.values[s][:-7] for s in range(len(df2))]
df3_cts = df2.cts.tolist()
df3['cts'] = df3_cts

print(df3.head(3).cts.values)


txt = [Cleaning_Text.listToText(x) for x in df3.cts.values]
df3['cts'] = txt


df = df3[:700]               #  700 , train_df
df_dev = df3[700:900]            #  200 , test_df

# Split the DataFrame into label and text lists and return them.
def read_txt_data(df):
    texts = df.cts.tolist()
    labels = df.labels.tolist()
    return labels, texts

train_labels, texts = read_txt_data(df)
train_df = pd.DataFrame({'label': train_labels, 'text': texts})

test_labels, texts = read_txt_data(df_dev)
test_df = pd.DataFrame({'label': test_labels, 'text': texts})

print(train_df.head())
print(test_df.head())
train_df['text_len'] = train_df['text'].apply(lambda x: len(x))

# ## Summary of the training data: the 75th-percentile text length is about 1221 characters; the BERT vectors will be 768-dimensional
print(train_df.describe())
# train_df.head()


labels
sentencses=['[CLS] ' + sent + ' [SEP]' for sent in train_df.text.values]



print("라벨:",labels)
print("첫구절:",sentencses[0])

# tokenized_sents=[tokenizer.tokenize(sent) for sent in sentencses]
# print("tokenized첫구절:",tokenized_sents[0])

# ------------------------------------------------------------------------------------------
import os
# Whether to use GPU(s) for training
# os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7,8"

import numpy as np
# from load_data import train_df, test_df
from keras.utils import to_categorical
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Input, BatchNormalization, Dense
from people_relation_extract.bert.extract_feature import BertVector


print('pass01')


max_seq_len = 100

# Build the BERT sentence-vector extractor (mean pooling over tokens, 768-dim output)
bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=max_seq_len)


print(bert_model)
#
# print('begin encoding')
# f = lambda text: bert_model.encode([text])["encodes"][0]
# train_df['x'] = train_df['text'].apply(f)
# test_df['x'] = test_df['text'].apply(f)
# print('end encoding')
#
# x_train = np.array([vec for vec in train_df['x']])
# x_test = np.array([vec for vec in test_df['x']])
# y_train = np.array([vec for vec in train_df['label']])
# y_test = np.array([vec for vec in test_df['label']])
# print('x_train: ', x_train.shape)
#
# # Convert class vectors to binary class matrices.
# num_classes = 2
# y_train = to_categorical(y_train, num_classes)
# y_test = to_categorical(y_test, num_classes)
#
# # Build the DNN model
# x_in = Input(shape=(768, ))
# x_out = Dense(32, activation="relu")(x_in)
# x_out = BatchNormalization()(x_out)
# x_out = Dense(num_classes, activation="softmax")(x_out)
# model = Model(inputs=x_in, outputs=x_out)
# print(model.summary())
#
# model.compile(loss='categorical_crossentropy',
#               optimizer=Adam(),
#               metrics=['accuracy'])
#
# # Train, evaluate, and save the model
# model.fit(x_train, y_train, batch_size=8, epochs=20)
# model.save('visit_classify.h5')
# print(model.evaluate(x_test, y_test))
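
# The commented-out DNN above comes from the original binary-classification
# tutorial and hard-codes num_classes = 2. A minimal sketch of adapting it to
# this multi-class subMenu data, assuming the stringified label indices built
# above, starts from the real number of categories:
num_classes = len(label_name)                                   # one class per subMenu category
y_train = to_categorical(train_df['label'].astype(int).values, num_classes)
y_test = to_categorical(test_df['label'].astype(int).values, num_classes)
print('num_classes:', num_classes, 'y_train:', y_train.shape, 'y_test:', y_test.shape)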

 

 

Reference links:

https://www.cnblogs.com/jclian91/p/12301056.html
  NLP(二十)利用BERT实现文本二分类 (text binary classification with BERT), 博客园

https://github.com/google-research/bert/blob/master/extract_features.py
  extract_features.py from google-research/bert

https://github.com/google-research/bert
  google-research/bert: TensorFlow code and pre-trained models for BERT

https://blog.csdn.net/ling620/article/details/97789853
  BERT 提取特征 (extract_features.py) 源码分析 (source-code analysis), CSDN blog
