公众号:尤而小屋 作者:Peter 编辑:Peter
大家好,我是Peter~
今天给大家分享一个文本分析的实战案例:基于LDA的Twitter文本分析。
In 1:
import os
import pandas as pd
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go
import gensim
from gensim import corpora, models, similarities
import logging
import tempfile
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from string import punctuation
from collections import OrderedDict
import seaborn as sns
import pyLDAvis.gensim # pip install pyLDAvis
import matplotlib.pyplot as plt
%matplotlib inline
init_notebook_mode(connected=True) # do not miss this line
import warnings
In 2:
# Load the tweet dataset; gb18030 handles the file's Chinese-compatible encoding.
df = pd.read_csv("data.csv",encoding="gb18030")
df.head()
Out2:
row ID | Tweet | Time | Retweet from | User | |
---|---|---|---|---|---|
0 | Row0 | @MeltingIce Assuming max acceleration of 2 to ... | 2017-09-29 17:39:19 | NaN | elonmusk |
1 | Row1 | RT @SpaceX: BFR is capable of transporting sat... | 2017-09-29 10:44:54 | SpaceX | elonmusk |
2 | Row2 | @bigajm Yup :) | 2017-09-29 10:39:57 | NaN | elonmusk |
3 | Row3 | Part 2 https://t.co/8Fvu57muhM | 2017-09-29 09:56:12 | NaN | elonmusk |
4 | Row4 | Fly to most places on Earth in under 30 mins a... | 2017-09-29 09:19:21 | NaN | elonmusk |
In 3:
df.shape # (rows, columns) of the dataset
Out3:
(3218, 5)
查看数据中的缺失值情况:
In 4:
df.isnull().sum()  # missing-value count per column
Out4:
row ID 0
Tweet 0
Time 0
Retweet from 2693
User 0
dtype: int64
In 5:
df.dtypes # column dtypes before the Time conversion
Out5:
row ID object
Tweet object
Time object
Retweet from object
User object
dtype: object
时间字段的转换:
In 6:
df["Time"] = pd.to_datetime(df["Time"]) # parse Time strings into datetime64
In 7:
df.dtypes # column dtypes after the Time conversion
Out7:
row ID object
Tweet object
Time datetime64[ns]
Retweet from object
User object
dtype: object
In 8:
# Re-parse the Time column with an explicit format.
# Fix: the original used '%y' (2-digit year), which does not match 4-digit
# years such as "2017-09-29"; '%Y' is the correct directive. (The column is
# already datetime64 after the previous cell, so the format only matters if
# this cell is run on the raw string data.)
df["Time"] = pd.to_datetime(df['Time'], format='%Y-%m-%d %H:%M:%S')
df.head()
Out8:
row ID | Tweet | Time | Retweet from | User | |
---|---|---|---|---|---|
0 | Row0 | @MeltingIce Assuming max acceleration of 2 to ... | 2017-09-29 17:39:19 | NaN | elonmusk |
1 | Row1 | RT @SpaceX: BFR is capable of transporting sat... | 2017-09-29 10:44:54 | SpaceX | elonmusk |
2 | Row2 | @bigajm Yup :) | 2017-09-29 10:39:57 | NaN | elonmusk |
3 | Row3 | Part 2 https://t.co/8Fvu57muhM | 2017-09-29 09:56:12 | NaN | elonmusk |
4 | Row4 | Fly to most places on Earth in under 30 mins a... | 2017-09-29 09:19:21 | NaN | elonmusk |
In 9:
df.drop("row ID", axis=1, inplace=True)  # drop the redundant row-ID column
In 10:
tweetsdata = df["Time"]  # tweet timestamps, used below for the activity histogram
tweetsdata
Out10:
0 2017-09-29 17:39:19
1 2017-09-29 10:44:54
2 2017-09-29 10:39:57
3 2017-09-29 09:56:12
4 2017-09-29 09:19:21
...
3213 2012-11-20 08:52:03
3214 2012-11-20 08:38:31
3215 2012-11-20 08:30:44
3216 2012-11-19 08:59:46
3217 2012-11-16 17:59:47
Name: Time, Length: 3218, dtype: datetime64[ns]
In 11:
# Histogram of tweet timestamps (plotly buckets the datetimes automatically).
histogram = go.Histogram(
    x=tweetsdata,               # timestamps on the x axis
    marker=dict(color="blue"),  # bar colour
    opacity=0.75,               # bar transparency
)

# Overall layout: title, figure size, axis titles and the gap between bars.
fig = go.Figure(
    data=[histogram],
    layout=go.Layout(
        title="Tweet Activity Over Years",
        height=450,
        width=1200,
        xaxis=dict(title='Month and year'),
        yaxis=dict(title='Tweet Quantity'),
        bargap=0.2,
    ),
)
fig.show()
准备好语料库corpus:
In 12:
corpus = df["Tweet"].tolist()  # raw tweet texts as a plain Python list
corpus[:5]
Out12:
["@MeltingIce Assuming max acceleration of 2 to 3 g's, but in a comfortable direction. Will feel like a mild to moder? https://t.co/fpjmEgrHfC",
'RT @SpaceX: BFR is capable of transporting satellites to orbit, crew and cargo to the @Space_Station and completing missions to the Moon an?',
'@bigajm Yup :)',
'Part 2 https://t.co/8Fvu57muhM',
'Fly to most places on Earth in under 30 mins and anywhere in under 60. Cost per seat should be? https://t.co/dGYDdGttYd']
In 13:
import os  # NOTE(review): os is already imported at the top of the notebook
TEMP_FOLDER = os.getcwd() # save dictionary/corpus files in the current working directory
In 14:
list1 = ['RT','rt']  # retweet markers to remove in addition to stopwords
# Stoplist = English stopwords + standalone punctuation characters + RT markers.
stoplist = stopwords.words('english') + list(punctuation) + list1
stoplist[:10]
Out14:
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
下面的代码表示:将每条推文转成小写并按空格切分成单词,同时过滤掉停用词表stoplist中的词。
In 15:
# Tokenise: lowercase each tweet, split on whitespace, drop stoplist words.
# NOTE(review): punctuation attached to a word (e.g. "direction.") survives,
# since only standalone punctuation characters are in the stoplist — confirm
# whether this is intended.
texts = [[word for word in str(document).lower().split() if word not in stoplist] for document in corpus]
print(texts[0])
['@meltingice', 'assuming', 'max', 'acceleration', '2', '3', "g's,", 'comfortable', 'direction.', 'feel', 'like', 'mild', 'moder?', 'https://t.co/fpjmegrhfc']
将单词用词袋表示,并且存储在指定路径下:
In 16:
# Build the token <-> integer-id dictionary from the tokenised tweets and save it.
dictionary = corpora.Dictionary(texts)
dictionary.save(os.path.join(TEMP_FOLDER, 'elon.dict'))
获取每个单词对应的id序号:
In 17:
dictionary.token2id # mapping from each token to its integer id
{'2': 0,
'3': 1,
'@meltingice': 2,
'acceleration': 3,
'assuming': 4,
'comfortable': 5,
'direction.': 6,
'feel': 7,
"g's,": 8,
'https://t.co/fpjmegrhfc': 9,
'like': 10,
'max': 11,
'mild': 12,
'moder?': 13,
'@space_station': 14,
......
}
生成语料corpus内容:将单词转换成词袋表示
In 18:
# Convert each tokenised tweet into its bag-of-words (token_id, count) form.
corpus = [dictionary.doc2bow(text) for text in texts]
corpus[:2]
Out18:
[[(0, 1),
(1, 1),
(2, 1),
(3, 1),
(4, 1),
(5, 1),
(6, 1),
(7, 1),
(8, 1),
(9, 1),
(10, 1),
(11, 1),
(12, 1),
(13, 1)],
[(14, 1),
(15, 1),
(16, 1),
(17, 1),
(18, 1),
(19, 1),
(20, 1),
(21, 1),
(22, 1),
(23, 1),
(24, 1),
(25, 1),
(26, 1)]]
将已经序列化的语料库保存成文件,需要指定一个路径:
In 19:
# Persist the serialised corpus to disk in Matrix Market format.
corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.mm'), corpus)
In 20:
tfidf = models.TfidfModel(corpus) # 1 - initialise the TF-IDF model from the BoW corpus
corpus_tfidf = tfidf[corpus] # 2 - re-weight the BoW corpus with TF-IDF scores
In 21:
total_topics = 5 # number of LDA topics to extract

# Train LDA on the bag-of-words corpus.
# random_state pins the (otherwise stochastic) topic assignment so the
# notebook's results are reproducible across runs.
lda = models.LdaModel(corpus,              # bag-of-words corpus
                      id2word=dictionary,  # id -> token mapping
                      num_topics=total_topics,  # number of topics
                      random_state=42
                      )
# NOTE(review): the model was trained on the raw BoW corpus but is applied to
# the TF-IDF-weighted corpus here, mixing two representations — confirm
# whether corpus_lda should instead be lda[corpus].
corpus_lda = lda[corpus_tfidf]
In 22:
lda.show_topics(total_topics, 3)  # top 3 weighted words for each topic
Out22:
[(0, '0.006*"..." + 0.006*"tesla" + 0.005*"model"'),
(1, '0.012*"launch" + 0.011*"falcon" + 0.009*"@spacex:"'),
(2, '0.014*"tesla" + 0.006*"model" + 0.005*"new"'),
(3, '0.011*"model" + 0.006*"good" + 0.006*"tesla"'),
(4, '0.008*"tesla" + 0.006*"model" + 0.005*"w"')]
In 23:
# Collect the top-25 (word, probability) pairs for every topic,
# keyed by topic id and kept in weight order via OrderedDict.
data_lda = {}
for topic_id in range(total_topics):
    data_lda[topic_id] = OrderedDict(lda.show_topic(topic_id, 25))
data_lda
Out23:
{0: OrderedDict([('...', 0.006462383),
('tesla', 0.005584647),
('model', 0.0048239143),
('new', 0.004051302),
('next', 0.003930719),
('great', 0.0030215571),
('good', 0.002984404),
('miles', 0.0029328458),
('like', 0.002857408),
("i'm", 0.0027793457),
('rocket', 0.0025001287),
('back', 0.0024146684),
('@elonmusk', 0.0023003744),
('long', 0.0022880563),
('super', 0.0022213901),
('@spacex', 0.0022024196),
('flight', 0.0021213787),
......
])}
In 24:
# Topic -> word-probability table; words absent from a topic's top-25 become NaN.
df_lda = pd.DataFrame(data_lda)
df_lda.head()
Out24:
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
... | 0.006462 | NaN | NaN | NaN | NaN |
tesla | 0.005585 | 0.005550 | 0.014481 | 0.005585 | 0.008305 |
model | 0.004824 | 0.002016 | 0.005960 | 0.010575 | 0.006079 |
new | 0.004051 | NaN | 0.004858 | NaN | 0.001924 |
next | 0.003931 | NaN | 0.002050 | 0.004409 | NaN |
In 25:
# Replace missing probabilities with 0 and transpose: rows = topics, columns = words.
df_lda = df_lda.fillna(0).T
df_lda
Out25:
... | tesla | model | new | next | great | good | miles | like | i'm | ... | vs | time | yeah, | software | people | 2 | yes | range | cool | yes, | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.006462 | 0.005585 | 0.004824 | 0.004051 | 0.003931 | 0.003022 | 0.002984 | 0.002933 | 0.002857 | 0.002779 | ... | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
1 | 0.000000 | 0.005550 | 0.002016 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.002652 | 0.000000 | ... | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
2 | 0.000000 | 0.014481 | 0.005960 | 0.004858 | 0.002050 | 0.000000 | 0.001897 | 0.000000 | 0.002496 | 0.000000 | ... | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
3 | 0.000000 | 0.005585 | 0.010575 | 0.000000 | 0.004409 | 0.000000 | 0.005739 | 0.000000 | 0.005094 | 0.000000 | ... | 0.001973 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
4 | 0.000000 | 0.008305 | 0.006079 | 0.001924 | 0.000000 | 0.002156 | 0.002271 | 0.000000 | 0.002187 | 0.000000 | ... | 0.000000 | 0.003843 | 0.00277 | 0.002591 | 0.002583 | 0.002147 | 0.002143 | 0.002091 | 0.002056 | 0.002036 |
5 rows × 80 columns
显示不同单词之间的相关性
In 26:
# Clustered heatmap of the word-word correlation matrix across topics.
g = sns.clustermap(df_lda.corr(), # correlation coefficients between word columns
                   center = 0,            # anchor the diverging colormap at 0
                   standard_scale = 1,    # scale each column to the 0-1 range
                   cmap = "RdBu",
                   metric = "cosine",     # distance metric used for clustering
                   linewidths = 0.75,
                   figsize = (10,10)
                   )
plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)  # keep y labels horizontal
plt.show()
In 27:
pyLDAvis.enable_notebook()  # render pyLDAvis output inline in the notebook
# Interactive topic visualisation; mds='tsne' picks the 2-D topic projection.
# NOTE(review): prepare() normally expects the BoW corpus the model was trained
# on; corpus_lda here is the LDA-transformed TF-IDF corpus — verify.
panel = pyLDAvis.gensim.prepare(lda, corpus_lda, dictionary, mds='tsne')
panel
项目地址:https://www.kaggle.com/code/errearanhas/topic-modelling-lda-on-elon-tweets/notebook
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。