数据源为B站视频评论
在加载新的一页评论时观察到访问的url
https://api.bilibili.com/x/v2/reply/main?callback=jQuery172020167562580015508_1653393655707&jsonp=jsonp&next=4&type=1&oid=768584323&mode=3&plat=1&_=1653396013955
其中
next:页数
oid:视频av号
访问时需要带上请求头(headers)
例如{"user-agent": "...", "referer":"https://www.bilibili.com"}
可以获得评论的json文件
以av=13662970(《你的名字》)为例
url=https://api.bilibili.com/x/v2/reply/main?jsonp=jsonp&next=%d&type=1&oid=13662970
修改page值爬取10页json(最后修改获取100页100个json)
读取10个json文件获取评论
data["data"]["replies"][i=0-19]["content"]["message"]
存在很多无意义词
创建ignore_dict.txt,根据结果排除词
词云,排除词
# Excerpt: load the word/frequency table from res.csv and draw it as a bar chart.
file_path="res.csv"
# NOTE(review): no header=None is passed, so pandas takes the first line of
# res.csv as a header row -- confirm the file has one, otherwise the first
# word is silently dropped when the columns are renamed below.
df=pd.read_csv(file_path,encoding="gbk")
# Name the two columns: the word and how many times it occurred.
df.columns = ["word","frequency"]
print(df.head())
plt.bar(df["word"],df["frequency"])
plt.xlabel('词')
plt.ylabel('出现次数')
plt.show()
like列表与用户等级列表
# For reply i of the loaded page, record its like count and the commenter's current level.
like.append(data["data"]["replies"][i]["like"])
level.append(data["data"]["replies"][i]["member"]["level_info"]["current_level"])
绘图
# Bar chart with the commenter level on the x axis and the like counts on the y axis.
plt.bar(level,like)
plt.xlabel('用户等级')
plt.ylabel('点赞总数')
使用的库:requests,jieba,re,pandas,matplotlib,json,csv,wordcloud
过程
from_bilibili_get_json.py---获取评论json
like_level.py---根据json得到不同等级用户的被点赞量并绘制柱状图
dejson.py---收集所有评论到comment.txt
词频.py---词频分析
keshijua.py---可视化
from_bilibili_get_json.py
import requests,json
# HTTP headers sent with every comment-API request: a browser user-agent
# and a bilibili referer.
headerbili = {
    # Adjacent literals are concatenated into ONE string; a plain quoted
    # string cannot span several physical lines.
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 "
        "Edg/100.0.1185.50"
    ),
    "referer": "https://www.bilibili.com",
}
def get_json(page, oid=13662970, out_dir="./json"):
    """Download one page of comments for a video and save it as a JSON file.

    The response is parsed, echoed to stdout, and written to
    ``<out_dir>/<page>.json``.

    Args:
        page: Sent as the ``next`` query parameter (the page number).
        oid: Sent as the ``oid`` query parameter (the video's av number).
        out_dir: Directory that receives the file; created when missing.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an HTTP error status.
        ValueError: The response body is not valid JSON.
    """
    import os  # local import: only this function needs it

    response = requests.get(
        "https://api.bilibili.com/x/v2/reply/main",
        params={"jsonp": "jsonp", "next": page, "type": 1, "oid": oid},
        headers=headerbili,
        timeout=10,  # never hang forever on a stalled connection
    )
    # Fail loudly instead of saving an error page as if it were comment data.
    response.raise_for_status()
    comm = json.loads(response.text)
    print(comm)

    os.makedirs(out_dir, exist_ok=True)
    target = os.path.join(out_dir, "{}.json".format(page))
    with open(target, "w", encoding="utf-8") as f:
        # ensure_ascii controls whether the output may contain non-ASCII
        # characters; Chinese text is outside ASCII, so turn escaping off
        # to keep the saved comments readable.
        f.write(json.dumps(comm, ensure_ascii=False))
# Fetch 100 pages; get_json writes one JSON file per page.
for i in range(100):  # 100 pages
    get_json(i)
    # Progress marker: the page index followed by a blank line.
    print(i, '\n')
# vedioreg = 'class="tap-router">(.*?)</a>'
# aurllist = re.findall(vedioreg, rank, re.S | re.M)
# print(aurllist)
like_level.py
import json
import matplotlib.pyplot as plt
def likes_and_levels(payload):
    """Return ``(likes, levels)`` for every reply in one saved comment page.

    Args:
        payload: Parsed JSON of one page; replies are read from
            ``payload["data"]["replies"]``.

    Returns:
        Two parallel lists: each reply's ``like`` value and its author's
        ``member.level_info.current_level``. Both are empty when the page
        has no ``data`` or its ``replies`` entry is missing or null.
    """
    replies = (payload.get("data") or {}).get("replies") or []
    likes = [reply["like"] for reply in replies]
    levels = [reply["member"]["level_info"]["current_level"] for reply in replies]
    return likes, levels


def total_likes_by_level(likes, levels):
    """Sum the like counts per level; the result is ordered by level."""
    totals = {}
    for lvl, count in zip(levels, likes):
        totals[lvl] = totals.get(lvl, 0) + count
    return dict(sorted(totals.items()))


def main():
    """Read ./json/0.json .. ./json/99.json and plot total likes per user level."""
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

    like = []
    level = []
    for e in range(100):
        with open("./json/{}.json".format(e), "r", encoding="utf8") as f:
            data = json.load(f)
        # Iterate over the replies actually present rather than a fixed 20,
        # so short or empty pages do not raise.
        page_likes, page_levels = likes_and_levels(data)
        like.extend(page_likes)
        level.extend(page_levels)
    print(like, level)

    # One bar per level holding the SUM of likes: passing the raw lists to
    # plt.bar would draw overlapping bars for every repeated level, which
    # does not match the "total likes" axis label.
    totals = total_likes_by_level(like, level)
    plt.bar(list(totals), list(totals.values()))
    plt.xlabel('用户等级')
    plt.ylabel('点赞总数')
    plt.show()


if __name__ == "__main__":
    main()
dejson.py
import json,csv
def extract_messages(payload):
    """Return the text of every reply in one saved comment page.

    Args:
        payload: Parsed JSON of one page; replies are read from
            ``payload["data"]["replies"]`` and each text from
            ``reply["content"]["message"]``.

    Returns:
        A list of message strings, empty when the page has no ``data`` or
        its ``replies`` entry is missing or null.
    """
    replies = (payload.get("data") or {}).get("replies") or []
    return [reply["content"]["message"] for reply in replies]


def main():
    """Collect the comments of ./json/0.json .. ./json/99.json into comment.txt."""
    comment = []  # all comment texts, in page order
    for e in range(100):
        with open("./json/{}.json".format(e), "r", encoding="utf8") as f:
            # Take however many replies the page really has instead of
            # indexing a fixed 0..19, which raises on short pages.
            comment.extend(extract_messages(json.load(f)))

    # All comments go into a single CSV row. newline="" is what the csv
    # module asks for, so the writer controls line endings itself.
    with open("comment.txt", "w", encoding="utf8", newline="") as fp:
        writer = csv.writer(fp)
        writer.writerow(comment)
    print("写入", comment)  # the comment text file


if __name__ == "__main__":
    main()
词频.py
import jieba,re
# Strip punctuation
def get_text(file_name):
    """Read a UTF-8 text file and return its content without punctuation.

    Line breaks and spaces are removed as well, so the result is one
    continuous string ready for word segmentation.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The file content with every character of the delete set removed.
    """
    # Characters to delete. The newline is written as '\n': a bare 'n'
    # would strip every letter n from the text. The half-width marks are
    # listed together with their full-width forms because comments mix both.
    del_ch = '《》。、\n ' + ',;:!?"' + ',;:!?“”'
    with open(file_name, 'r', encoding='utf-8') as fr:
        text = fr.read()
    # One pass over the text instead of one replace() call per character.
    return text.translate(str.maketrans('', '', del_ch))
def count_words(tokens):
    """Count tokens and rank them by frequency.

    Args:
        tokens: Iterable of word strings (the segmentation result).

    Returns:
        ``(word, count)`` pairs sorted by descending count, ties kept in
        first-seen order, with single-character words dropped.
    """
    res_dict = {}
    for token in tokens:
        res_dict[token] = res_dict.get(token, 0) + 1
    ranked = sorted(res_dict.items(), key=lambda item: item[1], reverse=True)
    return [(word, count) for word, count in ranked if len(word) >= 2]


def select_words(ranked, ignore_words, limit=1000):
    """Keep the top ``limit`` ranked words that are not in the ignore list.

    Args:
        ranked: ``(word, count)`` pairs, most frequent first.
        ignore_words: Words to exclude from the result.
        limit: How many leading entries of ``ranked`` to consider.

    Returns:
        ``(word, count)`` pairs; each word stays paired with its own count
        even when earlier entries were skipped.
    """
    ignore = set(ignore_words)  # O(1) membership tests
    rows = []
    # Slicing copes with vocabularies shorter than ``limit``; indexing a
    # fixed range would raise IndexError.
    for word, count in ranked[:limit]:
        if word in ignore:  # excluded word
            continue
        word = re.sub(r'[\n ]', '', word)
        if word:
            rows.append((word, count))
    return rows


def main():
    """Segment comment.txt, count the words and write the ranking to res.csv."""
    file_name = 'comment.txt'
    text = get_text(file_name)
    vlist = jieba.lcut(text)  # jieba segmentation, returns a list

    # Read the ignore list once. split() accepts one word per line as well
    # as several words separated by spaces on the same line.
    with open('ignore_dict.txt', 'r', encoding='utf-8') as f:
        ignore_words = f.read().split()

    rows = select_words(count_words(vlist), ignore_words)

    # "w" so a rerun replaces the previous result instead of appending
    # duplicates. gbk is the encoding the plotting script reads res.csv
    # with; characters gbk cannot encode are replaced rather than aborting.
    with open("res.csv", "w", encoding="gbk", errors="replace") as fa:
        fa.writelines("{},{}\n".format(word, count) for word, count in rows)


if __name__ == "__main__":
    main()
keshijua.py
import pandas as pd
import matplotlib.pyplot as plt
import wordcloud,jieba
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
# Original note: text containing Chinese needs the u'...' literal form
# (presumably a Python 2 leftover).
# Path of the word-frequency table
file_path = "res.csv"
# res.csv holds bare "word,count" rows with no header line. header=None
# stops pandas from consuming the first (most frequent) word as column
# names; names= labels the two columns directly.
df = pd.read_csv(file_path, encoding="gbk", header=None,
                 names=["word", "frequency"])
# Bar chart
def _bar():
    """Show the module-level ``df`` as a bar chart of word frequencies.

    Renames the columns of ``df`` in place to ``word`` / ``frequency``,
    prints the first rows, then displays a labelled bar chart.
    """
    df.columns = ["word", "frequency"]
    preview = df.head()
    print(preview)
    labels, heights = df["word"], df["frequency"]
    plt.bar(labels, heights)
    plt.xlabel('词')
    plt.ylabel('出现次数')
    plt.show()
_bar()

# Word cloud
with open("comment.txt", "r", encoding="utf8") as f:
    txt = f.read()

# Words left out of the cloud. The opening brace sits on the assignment
# line: an assignment that ends right after "=" is a syntax error.
stopwords = {
    "的", "了", "我", "时", "在", "你", "看", "到", "没", "不", "就", "是", "人", "也", "有", "和", "会",
    "一个", "没有", "时候", "弹幕", "自己", "什么", "时间", "知道", "就是", "现在", "真的", "已经", "还是", "这个", "看到", "可以", "因为",
    "你们", "才能", "不是", "但是", "那个", "最后", "每秒", "所以", "他们", "觉得", "怎么样", "一样", "可能", "举报", "这部", "大家", "不能", "当时",
    "一直", "一次", "然后", "还有", "这样", "评论", "如果", "那么", "为什么", "第一次", "感谢", "只是", "这些", "之后", "忘记", "一下", "虽然", "为了",
    "一定", "今天", "这么", "不会", "这里", "去年", "两个", "以后", "地方", "那些", "这种", "怎么", "其实", "起来", "应该", "---", "发生", "只有", "天气",
    "今年", "很多", "好像", "所有", "一部", "出来", "找到", "之子", "一遍", "谢谢", "告诉", "东西", "永远", "的话", "五块", "一句", "之前", "过去", "一年",
    "一天", "终于", "选择", "对于", "非常", "突然",
}

w = wordcloud.WordCloud(
    background_color="white",
    font_path="msyh.ttc",
    height=600,
    width=800,
    stopwords=stopwords,
)
# The segmented words are joined with spaces before being handed to generate().
w.generate(" ".join(jieba.lcut(txt)))
w.to_file("词云.png")
忽略词
根据结果自己写
一个 没有 没有 时候 弹幕 自己 什么 时间 知道 就是 现在 真的 已经 还是 看到 可以 因为 你们 才能 不是 但是 每秒 所以 他们 觉得 怎么样 一样 举报 这部 大家 不能 当时 一直 一次 然后 还有 这样 评论 如果 那么 为什么 第一次 感谢 只是 这些 之后 忘记 一下 虽然 为了 一定 今天 这么 不会 这里 去年 两个 以后 地方 那些 这种 怎么 其实 起来 应该 --- 发生 只有 天气 今年 很多 好像 所有 一部 出来 找到 之子 一遍 谢谢 告诉 东西 永远 的话 五块 一句 之前 过去 一年 一天 终于 选择 对于 非常 突然
五、结论与心得
最初计划爬取TapTap的评论数据,并参考了下面的博客进行分析:
【Python】爬取TapTap原神评论并生成词云分析_includei的博客-CSDN博客_爬取taptap评论
但最后发现TapTap的评论数据接口需要通过滑动验证才能访问,
而哔哩哔哩的评论数据接口访问时不需要验证,因此最终改为爬取B站视频评论
六、参考文献
【Python】爬取TapTap原神评论并生成词云分析_includei的博客-CSDN博客_爬取taptap评论
WordCloud词云图去除停用词的正确方法_罗罗攀的博客-CSDN博客