-
Notifications
You must be signed in to change notification settings - Fork 0
/
wordCut.py
42 lines (30 loc) · 1.19 KB
/
wordCut.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import pandas as pd
import jieba
#import gensim
import re
def get_custom_stopwords(stop_words_file):
    """Read a stop-word file and return its lines as a list of strings.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.

    Args:
        stop_words_file: Path of a text file with one stop word per line.

    Returns:
        A list with one entry per line, in file order. Entries are not
        stripped or de-duplicated; a file ending in a newline therefore
        yields a trailing empty string.
    """
    with open(stop_words_file, encoding="utf-8") as sw:
        return sw.read().split('\n')
def cut_char(text):
    """Segment *text* with jieba (``cut_all=True``) and join the tokens with "/"."""
    tokens = jieba.cut(text, cut_all=True)
    return "/".join(tokens)
def cutData(filePath, title_weight=70):
    """Load a CSV and segment its ``title`` and ``content`` columns.

    The file is read with its first column as the index. Both text columns
    are cast to ``str`` and replaced by the "/"-joined segmentation returned
    by ``cut_char``. A ``combine`` column is then added, made of the
    segmented content, a "/", and the segmented title (with a trailing "/")
    repeated ``title_weight`` times.

    Args:
        filePath: Path of the CSV file to read; it must contain ``title``
            and ``content`` columns.
        title_weight: How many times the segmented title is repeated inside
            ``combine``. Defaults to 70, the value that used to be
            hard-coded.

    Returns:
        The DataFrame with segmented ``title`` / ``content`` columns and the
        added ``combine`` column. Its first rows are also printed.
    """
    frame = pd.read_csv(filePath, index_col=0)

    for column in ('title', 'content'):
        # NOTE(review): astype(str) presumably turns missing cells into the
        # literal text "nan" before segmentation - confirm the input has no
        # empty titles/contents, or fill them upstream.
        frame[column] = frame[column].astype(str).apply(cut_char)

    # Repeating the title presumably makes it weigh more than the content in
    # whatever consumes ``combine`` - verify against the downstream step.
    repeated_title = title_weight * (frame['title'] + '/')
    frame['combine'] = frame['content'] + '/' + repeated_title

    print(frame.head())
    return frame
if __name__ == '__main__':
    # Load the user dictionary 'dict.txt' into jieba before segmenting.
    jieba.load_userdict('dict.txt')
    # Turn on jieba's parallel mode, passing 2 as its argument.
    jieba.enable_parallel(2)

    print("Processing: cutting train data...")
    # The segmented training set is written back to the same path it was
    # read from.
    train_path = 'Train/preprocessed_train_data.csv'
    cutData(train_path).to_csv(train_path)

    print("Processing: cutting test data...")
    # The segmented test set goes to a separate result file.
    cutData('Test/Test_DataSet_P.csv').to_csv('Test/result.csv')