In a previous post we scraped the 较真查证平台 (Jiaozhen fact-checking platform). In this post we will analyze that data visually, drawing the charts with the pyecharts library.
echarts is a data visualization library open-sourced by Baidu. Thanks to its good interactivity and well-crafted chart design, it has earned wide recognition among developers; pyecharts is a Python wrapper around echarts.
Install it directly with pip:
pip install pyecharts
import pyecharts
# Import pyecharts and print its version to verify the installation succeeded
print(pyecharts.__version__)
In the scraped rumor data, the result column holds the verdict for each rumor, which falls into three cases: 真 (true), 假 (false) and 疑 (doubtful). The 假 verdicts are further broken down into 钓鱼贴, 都市传说, 假新闻, 旧闻重炒, 伪常识, 伪科学, 洋葱新闻, 谣言 and 疑似诈骗. We will draw pie charts for both breakdowns to compare the proportions.
First extract the counts for 真, 假 and 疑 from the sqlite3 database, along with the counts for the 钓鱼贴, 都市传说, 假新闻, 旧闻重炒, 伪常识, 伪科学, 洋葱新闻, 谣言 and 疑似诈骗 subcategories:
import sqlite3
conn = sqlite3.connect('jiaozhen.db')
# Count rumors whose verdict (result) starts with 真, 假 or 疑
zhen = conn.execute("select count(*) from yaoyan where yaoyan.result like '真%'")
jia = conn.execute("select count(*) from yaoyan where yaoyan.result like '假%'")
yi = conn.execute("select count(*) from yaoyan where yaoyan.result like '疑%'")
zhenSum = zhen.fetchone()[0]
jiaSum = jia.fetchone()[0]
yiSum = yi.fetchone()[0]
# Count each subcategory of false rumors (result ends with the category name)
jia1 = conn.execute("select count(*) from yaoyan where yaoyan.result like '%钓鱼贴'")
jia2 = conn.execute("select count(*) from yaoyan where yaoyan.result like '%都市传说'")
jia3 = conn.execute("select count(*) from yaoyan where yaoyan.result like '%假新闻'")
jia4 = conn.execute("select count(*) from yaoyan where yaoyan.result like '%旧闻重炒'")
jia5 = conn.execute("select count(*) from yaoyan where yaoyan.result like '%伪常识'")
jia6 = conn.execute("select count(*) from yaoyan where yaoyan.result like '%伪科学'")
jia7 = conn.execute("select count(*) from yaoyan where yaoyan.result like '%洋葱新闻'")
jia8 = conn.execute("select count(*) from yaoyan where yaoyan.result like '%谣言'")
jia9 = conn.execute("select count(*) from yaoyan where yaoyan.result like '%疑似诈骗'")
jia1Sum = jia1.fetchone()[0]
jia2Sum = jia2.fetchone()[0]
jia3Sum = jia3.fetchone()[0]
jia4Sum = jia4.fetchone()[0]
jia5Sum = jia5.fetchone()[0]
jia6Sum = jia6.fetchone()[0]
jia7Sum = jia7.fetchone()[0]
jia8Sum = jia8.fetchone()[0]
jia9Sum = jia9.fetchone()[0]
conn.close()
print('真:%d 假:%d 疑:%d'%(zhenSum,jiaSum,yiSum))
print('钓鱼贴:%d 都市传说:%d 假新闻:%d 旧闻重炒:%d 伪常识:%d 伪科学:%d 洋葱新闻:%d 谣言:%d 疑似诈骗:%d'%(jia1Sum,jia2Sum,jia3Sum,jia4Sum,jia5Sum,jia6Sum,jia7Sum,jia8Sum,jia9Sum))
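The nine near-identical subcategory queries above can also be collapsed into a loop. A minimal sketch, assuming the same jiaozhen.db file and yaoyan table as above:

import sqlite3

# The same subcategory counts, collected in a loop instead of nine statements
sub_categories = ['钓鱼贴', '都市传说', '假新闻', '旧闻重炒', '伪常识',
                  '伪科学', '洋葱新闻', '谣言', '疑似诈骗']
conn = sqlite3.connect('jiaozhen.db')
sub_counts = {
    cat: conn.execute("select count(*) from yaoyan where result like ?",
                      ('%' + cat,)).fetchone()[0]
    for cat in sub_categories
}
conn.close()
print(sub_counts)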
With the counts pulled straight from the database, the next step is to draw the pie charts:
# Build the pie chart data
data = [('真',zhenSum),('假',jiaSum),('疑',yiSum)]
data1 = [('钓鱼贴',jia1Sum),('都市传说',jia2Sum),('假新闻',jia3Sum),('旧闻重炒',jia4Sum),('伪常识',jia5Sum),('伪科学',jia6Sum),('洋葱新闻',jia7Sum),('谣言',jia8Sum),('疑似诈骗',jia9Sum)]
from pyecharts import options as opts
from pyecharts.charts import Pie
c = (
# Create the pie chart object
Pie()
# Add the pie data
# The data format is [(key1, value1), (key2, value2), ...]
.add("",data,center=['25%','50%'])
.add("",data1,center=['75%','50%'])
# Set global options, including the chart title
.set_global_opts(title_opts=opts.TitleOpts(title='谣言分析',subtitle='基于较真查证平台',pos_left='center',pos_top=20))
)
# Since this is written in a jupyter notebook, we render with render_notebook()
# render() can be used instead to produce an HTML page
c.render_notebook()
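If you are not working in a notebook, render() can be pointed at a file to produce a standalone HTML page, and set_series_opts can add percentage labels to the slices. A small sketch; the file name and label format below are my own choices, not part of the original:

# Show "name: percent%" labels on each slice (the formatter string is an assumption)
c.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))
# Outside a notebook, write a standalone HTML page instead of rendering inline
c.render('yaoyan_pie.html')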
We use the jieba Chinese word segmentation library to analyze the title column of the rumor data with the TF-IDF algorithm, and build a word cloud from the resulting keyword weights. With withWeight=True, extract_tags returns (word, weight) pairs, which is exactly the data format WordCloud.add expects.
# Open a connection to the database
conn = sqlite3.connect('jiaozhen.db')
# Run the SQL query, returning a cursor over the results
titles = conn.execute('select title from yaoyan;')
sentence = ''
# Join all titles into a single string, separated by 。
for title in titles.fetchall():
    sentence = sentence + title[0] + '。'
print(sentence[:100])
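As an aside, the same string can be built in one pass with str.join. A sketch that opens its own connection so it can run on its own:

import sqlite3

# Alternative to the loop above: join every title with 。 in a single pass
conn = sqlite3.connect('jiaozhen.db')
sentence = '。'.join(row[0] for row in conn.execute('select title from yaoyan;'))
conn.close()
print(sentence[:100])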
import jieba.analyse
# Load a stop-word list so that filler words such as 的 and 如果 do not interfere
jieba.analyse.set_stop_words("chineseStopWordsYaoyan.txt")
# Extract keywords with the TF-IDF algorithm
keywords = jieba.analyse.extract_tags(sentence,topK=100,withWeight=True)
print(keywords)
from pyecharts.charts import WordCloud
c = (
WordCloud()
.add('',keywords)
)
c.render_notebook()
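The word cloud can be tuned further through parameters of WordCloud.add such as word_size_range and shape, and given a title just like the pie charts. A sketch; the specific values and title text below are assumptions for illustration:

from pyecharts import options as opts
from pyecharts.charts import WordCloud

c = (
    WordCloud()
    # word_size_range controls the font-size span, shape the overall outline
    .add('', keywords, word_size_range=[12, 60], shape='circle')
    # Title text here is illustrative, not from the original post
    .set_global_opts(title_opts=opts.TitleOpts(title='谣言标题词云', subtitle='基于较真查证平台'))
)
c.render_notebook()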