Outline:

  1. Data format types
  2. Data preprocessing
  3. Jieba word segmentation
  4. TF-IDF term vector space
  5. Classifiers

 

1: Data Format Types

As I mentioned in the previous post, the texts scraped for this project fall into 13 classes, spread across 11 data files.

The column headers are:

[序号 (index), aid, 标签 (label), titlecontent, 骑行技巧 (riding skills), 赛事 (races), 心灵鸡汤 (inspirational), 保养 (maintenance), 评测 (reviews), 改装 (modification), 维修 (repair), 装备 (gear), 游记 (travelogues), 资讯车讯 (news: vehicles), 资讯行业新闻 (news: industry), 资讯厂家活动 (news: manufacturer events), 摩托文化 (motorcycle culture)], and the column of the class an article belongs to is marked with 1.
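For concreteness, a single made-up row might look like the sketch below. All values are illustrative, not from the real data; the 1 under 赛事 marks the article as a race article.

# Hypothetical example row (illustrative values only)
row = {'序号': 1, 'aid': 10086, '标签': '赛事',
       'titlecontent': '2019赛季揭幕战回顾……',
       '赛事': 1, '心灵鸡汤': None, '保养': None}  # the remaining category columns are likewise empty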

 

2: Data Preprocessing

Data preprocessing merges all of the data and turns it into (x, y) pairs, where x is the text content and y is the label. Labels are written in pinyin, e.g. saishi for 赛事 (races) and jitang for 心灵鸡汤 (inspirational). The full code is as follows:

import re
import pickle

import jieba
import pandas as pd
from sklearn.utils import Bunch
from sklearn.model_selection import train_test_split


# Load the stopword list
def get_stopwords_list():
    stopwords_path = './file/stopword.txt'
    stopwords_list = [sw.strip() for sw in open(stopwords_path, encoding='utf-8').readlines()]
    return stopwords_list


def preprocess_text(content_lines, stopwords, bunch, category):
    for line in content_lines:
        try:
            segs = jieba.lcut(line)                            # segment with jieba
            segs = filter(lambda x: len(x) > 1, segs)          # drop single characters
            segs = filter(lambda x: x not in stopwords, segs)  # drop stopwords
            segs = ' '.join(segs)
            # exact = jieba.analyse.extract_tags(" ".join(segs), topK=20)
            # if len(exact) != 0:
            #     .append((' '.join(exact), category))
            if len(segs) != 0:
                bunch.contents.append(segs)
                bunch.label.append(category)
        except Exception:
            continue


def process():
    df0 = pd.read_csv('./file/moto.csv', encoding='utf-8')
    df0 = df0.drop(df0.columns[[0, 1, 2, 3, 5, 6, 7, 21, 22, 23, 24]], axis=1)

    # note: read_excel no longer accepts an encoding argument in current pandas
    df1 = pd.read_excel('./file/1-1000.xlsx').iloc[1:, 4:18]
    df1.to_csv('./file/df1_to_test.csv', encoding='utf-8')
    df2 = pd.read_excel('./file/1001-2000.xlsx').iloc[1:, 4:18]
    df3 = pd.read_excel('./file/2001-3000.xlsx').iloc[1:, 4:18]
    df4 = pd.read_excel('./file/3001-4000.xlsx').iloc[1:, 4:18]
    df5 = pd.read_excel('./file/4001-5000.xlsx').iloc[1:, 4:18]
    df6 = pd.read_excel('./file/5001-6000.xlsx').iloc[1:, 4:18]
    df7 = pd.read_excel('./file/6001-7000.xlsx').iloc[1:, 4:18]
    df8 = pd.read_excel('./file/7001-8000.xlsx').iloc[1:, 4:18]
    df9 = pd.read_excel('./file/8001-9000.xlsx').iloc[1:, 4:18]
    df10 = pd.read_excel('./file/9001-10000.xlsx').iloc[1:, 4:18]
    # concat
    df = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10])
    # change columns
    df.columns = ['content', 'saishi', 'jitang', 'baoyang', 'pingce', 'gaizhuang',
                  'weixiu', 'zhuangbei', 'youji', 'zixun_chexun', 'zixun_hangye',
                  'zixun_changjia', 'jiqiao', 'wenhua']
    df = pd.concat([df0, df])

    # drop rows with no content
    df = df.dropna(subset=['content'])
    # drop rows where all 13 label columns are null
    df = df.dropna(subset=['saishi', 'jitang', 'baoyang', 'pingce', 'gaizhuang', 'weixiu', 'zhuangbei', 'youji',
                           'zixun_chexun', 'zixun_hangye', 'zixun_changjia', 'jiqiao', 'wenhua'],
                   how='all').reset_index(drop=True)

    # strip letters, digits, and ASCII/full-width punctuation
    r1 = '[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
    df['content'] = df['content'].astype(str).apply(lambda x: x.replace('\r', '').replace('\n', ''))
    df['content'] = df['content'].astype(str).apply(lambda x: re.sub(r1, '', x))
    df_saishi = df[['content', 'saishi']].dropna()
    df_jitang = df[['content', 'jitang']].dropna()
    df_baoyang = df[['content', 'baoyang']].dropna()
    df_pingce = df[['content', 'pingce']].dropna()
    df_gaizhuang = df[['content', 'gaizhuang']].dropna()
    df_weixiu = df[['content', 'weixiu']].dropna()
    df_zhuangbei = df[['content', 'zhuangbei']].dropna()
    df_youji = df[['content', 'youji']].dropna()
    df_zixun_chexun = df[['content', 'zixun_chexun']].dropna()
    df_zixun_hangye = df[['content', 'zixun_hangye']].dropna()
    df_zixun_changjia = df[['content', 'zixun_changjia']].dropna()
    df_jiqiao = df[['content', 'jiqiao']].dropna()
    df_wenhua = df[['content', 'wenhua']].dropna()

    stopwords = get_stopwords_list()
    # change dataframe to list
    saishi = df_saishi.content.values.tolist()
    jitang = df_jitang.content.values.tolist()
    baoyang = df_baoyang.content.values.tolist()
    pingce = df_pingce.content.values.tolist()
    gaizhuang = df_gaizhuang.content.values.tolist()
    weixiu = df_weixiu.content.values.tolist()
    zhuangbei = df_zhuangbei.content.values.tolist()
    youji = df_youji.content.values.tolist()
    zixun_chexun = df_zixun_chexun.content.values.tolist()
    zixun_hangye = df_zixun_hangye.content.values.tolist()
    zixun_changjia = df_zixun_changjia.content.values.tolist()
    jiqiao = df_jiqiao.content.values.tolist()
    wenhua = df_wenhua.content.values.tolist()

    bunch = Bunch(contents=[], label=[])

    preprocess_text(saishi, stopwords, bunch, 'saishi')
    preprocess_text(jitang, stopwords, bunch, 'jitang')
    preprocess_text(baoyang, stopwords, bunch, 'baoyang')
    preprocess_text(pingce, stopwords, bunch, 'pingce')
    preprocess_text(gaizhuang, stopwords, bunch, 'gaizhuang')
    preprocess_text(weixiu, stopwords, bunch, 'weixiu')
    preprocess_text(zhuangbei, stopwords, bunch, 'zhuangbei')
    preprocess_text(youji, stopwords, bunch, 'youji')
    preprocess_text(zixun_chexun, stopwords, bunch, 'zixun_chexun')
    preprocess_text(zixun_hangye, stopwords, bunch, 'zixun_hangye')
    preprocess_text(zixun_changjia, stopwords, bunch, 'zixun_changjia')
    preprocess_text(jiqiao, stopwords, bunch, 'jiqiao')
    preprocess_text(wenhua, stopwords, bunch, 'wenhua')

    data = pd.DataFrame(columns=['content', 'category'])
    data['content'], data['category'] = bunch.contents, bunch.label
    data.to_csv('./file/data.csv', encoding='utf_8_sig')

    # Get train_data and test_data
    train_bunch = Bunch(contents=[], label=[])
    test_bunch = Bunch(contents=[], label=[])

    train_bunch.contents, test_bunch.contents, train_bunch.label, test_bunch.label = \
        train_test_split(bunch.contents, bunch.label, test_size=0.2)

    with open('./file/train_bunch.dat', 'wb') as fp:
        pickle.dump(train_bunch, fp)
    with open('./file/test_bunch.dat', 'wb') as fp:
        pickle.dump(test_bunch, fp)
    return
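Once process() has run, the two pickle files can be loaded back for the TF-IDF and classifier steps later in the pipeline. A minimal usage sketch:

import pickle

with open('./file/train_bunch.dat', 'rb') as fp:
    train_bunch = pickle.load(fp)
with open('./file/test_bunch.dat', 'rb') as fp:
    test_bunch = pickle.load(fp)
print(len(train_bunch.contents), len(test_bunch.contents))  # roughly an 80/20 split

If you want the split to preserve the class proportions, note that train_test_split also accepts a stratify argument (stratify=bunch.label).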

 

Let's go through it piece by piece. In the process() function, we first read in all the files; the relevant code is:

df0 = pd.read_csv('./file/moto.csv', encoding='utf-8')
df0 = df0.drop(df0.columns[[0, 1, 2, 3, 5, 6, 7, 21, 22, 23, 24]], axis=1)

df1 = pd.read_excel('./file/1-1000.xlsx').iloc[1:, 4:18]
df2 = pd.read_excel('./file/1001-2000.xlsx').iloc[1:, 4:18]
df3 = pd.read_excel('./file/2001-3000.xlsx').iloc[1:, 4:18]
df4 = pd.read_excel('./file/3001-4000.xlsx').iloc[1:, 4:18]
df5 = pd.read_excel('./file/4001-5000.xlsx').iloc[1:, 4:18]
df6 = pd.read_excel('./file/5001-6000.xlsx').iloc[1:, 4:18]
df7 = pd.read_excel('./file/6001-7000.xlsx').iloc[1:, 4:18]
df8 = pd.read_excel('./file/7001-8000.xlsx').iloc[1:, 4:18]
df9 = pd.read_excel('./file/8001-9000.xlsx').iloc[1:, 4:18]
df10 = pd.read_excel('./file/9001-10000.xlsx').iloc[1:, 4:18]

df0 is the data from my first run, which I'm loading again as part of this dataset. Its table format differs slightly from the later files, so the columns selected differ as well; you can ignore that detail.

df1 through df10 are the data newly added this time. Their columns are as described above; when reading, we take every row plus the content column and the 13 label columns. The statements below then concatenate the frames and rename the columns (an optional loop-based version is sketched after the code).

# concat
df = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10])

# change columns
df.columns = ['content', 'saishi', 'jitang', 'baoyang', 'pingce', 'gaizhuang', 'weixiu', 'zhuangbei', 'youji', 'zixun_chexun', 'zixun_hangye', 'zixun_changjia', 'jiqiao', 'wenhua']
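As a side note, the ten nearly identical read_excel calls could be collapsed into a loop. A minimal sketch, assuming the same ./file/ layout and 1,000-row file naming:

import pandas as pd

# Build the file names from the 1000-row ranges and concatenate in one pass
# (equivalent to df1..df10 above).
frames = []
for start in range(1, 10000, 1000):
    path = './file/{}-{}.xlsx'.format(start, start + 999)
    frames.append(pd.read_excel(path).iloc[1:, 4:18])
df = pd.concat(frames)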

Finally, df0 is added in as well. After its earlier cleanup, df0's columns already match the renamed columns above, so I simply concatenate them into one new DataFrame, df.

df = pd.concat([df0, df])

 

Next I drop rows with missing data. Why is data missing? Some articles consist entirely of images or video, or contain nothing but a few stray letters and symbols, with no real text at all; and some label columns are simply blank (thanks a lot, data providers!! I got stuck here earlier, until I realized that part of the data really has no labels. No labels. No labels.). The code:

# drop rows with no content
df = df.dropna(subset=['content'])
# drop rows where all 13 label columns are null
df = df.dropna(subset=['saishi', 'jitang', 'baoyang', 'pingce', 'gaizhuang', 'weixiu', 'zhuangbei', 'youji',
                       'zixun_chexun', 'zixun_hangye', 'zixun_changjia', 'jiqiao', 'wenhua'],
               how='all').reset_index(drop=True)
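To see what how='all' does here, a minimal sketch on made-up rows (the toy values below are illustrative, not from the real data):

import pandas as pd

# Toy frame: row 0 has one label, row 1 has none, row 2 has no content.
toy = pd.DataFrame({'content': ['race report', 'just pictures', None],
                    'saishi':  [1.0, None, 1.0],
                    'jitang':  [None, None, None]})
toy = toy.dropna(subset=['content'])                      # drops row 2
toy = toy.dropna(subset=['saishi', 'jitang'], how='all')  # drops row 1
print(toy)  # only the 'race report' row survives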

The code below filters the text content itself:

r1 = '[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
df['content'] = df['content'].astype(str).apply(lambda x: x.replace('\r', '').replace('\n', ''))
df['content'] = df['content'].astype(str).apply(lambda x: re.sub(r1, '', x))
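A quick check of what the pattern removes (letters, digits, and both ASCII and full-width punctuation) on a made-up sentence:

import re

r1 = '[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
sample = '本田CBR650R,2019年上市!性能很棒……'
print(re.sub(r1, '', sample))  # -> '本田年上市性能很棒'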

Next, the rows belonging to each class are pulled out:

df_saishi = df[['content', 'saishi']].dropna()
df_jitang = df[['content', 'jitang']].dropna()
df_baoyang = df[['content', 'baoyang']].dropna()
df_pingce = df[['content', 'pingce']].dropna()
df_gaizhuang = df[['content', 'gaizhuang']].dropna()
df_weixiu = df[['content', 'weixiu']].dropna()
df_zhuangbei = df[['content', 'zhuangbei']].dropna()
df_youji = df[['content', 'youji']].dropna()
df_zixun_chexun = df[['content', 'zixun_chexun']].dropna()
df_zixun_hangye = df[['content', 'zixun_hangye']].dropna()
df_zixun_changjia = df[['content', 'zixun_changjia']].dropna()
df_jiqiao = df[['content', 'jiqiao']].dropna()
df_wenhua = df[['content', 'wenhua']].dropna()

stopwords = get_stopwords_list()
# change dataframe to list
saishi = df_saishi.content.values.tolist()
jitang = df_jitang.content.values.tolist()
baoyang = df_baoyang.content.values.tolist()
pingce = df_pingce.content.values.tolist()
gaizhuang = df_gaizhuang.content.values.tolist()
weixiu = df_weixiu.content.values.tolist()
zhuangbei = df_zhuangbei.content.values.tolist()
youji = df_youji.content.values.tolist()
zixun_chexun = df_zixun_chexun.content.values.tolist()
zixun_hangye = df_zixun_hangye.content.values.tolist()
zixun_changjia = df_zixun_changjia.content.values.tolist()
jiqiao = df_jiqiao.content.values.tolist()
wenhua = df_wenhua.content.values.tolist()

At this point the articles of every class have been extracted into their own lists: saishi, jitang, and so on. Next we put the data into (x, y) form, where x is the content and y is the class. Here I use a Bunch object; you can look up Bunch yourself, but all you need to know is that mine has two members, contents and label. A minimal sketch follows.
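For reference, here is how the Bunch behaves, assuming the sklearn.utils.Bunch class (essentially a dict whose keys are also accessible as attributes):

from sklearn.utils import Bunch

bunch = Bunch(contents=[], label=[])
bunch.contents.append('摩托 保养 机油')  # attribute access...
bunch['label'].append('baoyang')         # ...and dict access are interchangeable
print(bunch.contents, bunch.label)

The lists saishi, jitang, etc. are then each run through the preprocess_text() function, whose code is repeated below: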

def preprocess_text(content_lines, stopwords, bunch, category):
    for line in content_lines:
        try:
            segs = jieba.lcut(line)                            # segment with jieba
            segs = filter(lambda x: len(x) > 1, segs)          # drop single characters
            segs = filter(lambda x: x not in stopwords, segs)  # drop stopwords
            segs = ' '.join(segs)
            # exact = jieba.analyse.extract_tags(" ".join(segs), topK=20)
            # if len(exact) != 0:
            #     .append((' '.join(exact), category))
            if len(segs) != 0:
                bunch.contents.append(segs)
                bunch.label.append(category)
        except Exception:
            continue

As you can see, preprocess_text takes four parameters: content_lines, stopwords, bunch, and category. content_lines corresponds to the lists saishi, jitang, etc. from above; stopwords is the stopword list; bunch is the previously defined Bunch object with its two members, bunch.contents and bunch.label; and category is a string, the article class such as "saishi" or "jitang". Once a list such as saishi or jitang is passed in, the text is first segmented with jieba, then single characters are filtered out and stopwords removed, and finally each non-empty result is appended to bunch.contents while its class is appended to bunch.label. Note the jieba segmentation step; I'll say a bit more about jieba below.
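Before that, to make the segment, filter, and join pipeline concrete, here is a tiny sketch on a made-up sentence (the sentence and stopword list are illustrative only; jieba's exact output depends on its dictionary):

import jieba

stopwords = ['的', '了']                 # illustrative stopword list
line = '今天的摩托车保养技巧分享'
segs = jieba.lcut(line)                  # e.g. ['今天', '的', '摩托车', '保养', '技巧', '分享']
segs = [w for w in segs if len(w) > 1]   # drop single characters (this also drops '的')
segs = [w for w in segs if w not in stopwords]
print(' '.join(segs))                    # e.g. '今天 摩托车 保养 技巧 分享'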

3: Jieba Segmentation

To be continued…

A gripe: writing this kind of post in WordPress is really quite inconvenient. When I was writing up machine-learning algorithms earlier, I couldn't enter any of the formulas; now, when pasting code, it mangles indentation and won't accept Tab, and even the font is hard to change. There are supposedly plugins for this, and I've fiddled with them for a long time; I'm already dreading the upcoming text-cnn post. If this keeps up I'll be heading back to a third-party blog platform…
