무회blog

python: 200529-python-test002ldaModel-토픽추출 본문

Python

python: 200529-python-test002ldaModel-토픽추출

최무회 2020. 5. 29. 19:27
from gensim import corpora
from gensim.models import LsiModel
from gensim.parsing.preprocessing import preprocess_string

import re

def clean_text(x: str) -> str:
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Args:
        x: The raw input string.

    Returns:
        A copy of ``x`` with all characters outside ``[a-zA-Z0-9\\s]``
        deleted. Whitespace (spaces, tabs, newlines) is kept as-is.
    """
    pattern = r'[^a-zA-Z0-9\s]'
    # Return the substituted string; returning the untouched input would
    # make this function a no-op.
    return re.sub(pattern, '', x)

def clean_numbers(x):
    """Mask runs of ASCII digits with '#' characters.

    A run of two, three or four consecutive digits becomes the same number
    of '#' characters; a run of five or more digits collapses to exactly
    '#####'. Isolated single digits are left untouched.
    """
    # Fast exit when the text has no digit at all.
    if re.search(r'\d', x) is None:
        return x

    def _mask(run):
        # Cap the mask at five characters so long numbers all look alike.
        return '#' * min(len(run.group(0)), 5)

    # Greedy matching grabs each maximal digit run in a single pass.
    return re.sub('[0-9]{2,}', _mask, x)

def clean(x):
    """Run the full cleaning pipeline on a string.

    Applies ``clean_text`` first and then ``clean_numbers`` to its result.

    Args:
        x: The raw input string.

    Returns:
        The value returned by ``clean_numbers`` for the text-cleaned input.
    """
    x = clean_text(x)
    # The helper is named ``clean_numbers``; a misspelled name here would
    # raise NameError on every call.
    x = clean_numbers(x)
    return x
        
Comments