《戰(zhàn)狼Ⅱ》破50億了,你還不知道它在說啥?本文通過Python爬蟲抓取12萬條影評并加以分析,告訴你《戰(zhàn)狼Ⅱ》用什么撩到了你。
# Scrape Douban comments for "Wolf Warrior 2" (subject 26363254) page by page,
# appending each page of parsed comments to a local CSV file.
import re

import pandas as pd
import requests

url_first = 'https://movie.douban.com/subject/26363254/comments?start=0'
head = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Ubuntu Chromium/59.0.3071.109 Chrome/59.0.3071.109 Safari/537.36'}
# BUG FIX: `cookies` must be assigned BEFORE the first requests.get() that
# passes it (the original defined it one statement too late -> NameError).
cookies = {'cookie': '你自己的cookie'}  # i.e. the cookie of your logged-in account

# NOTE(review): this pattern looks garbled -- the HTML tags it should match
# were stripped when the article was published, and the original code also
# referenced a second, now-missing pattern named `ren` for the comment fields.
# Restore the original tag-aware patterns before running. TODO confirm.
reg = re.compile(r'.*?.*?(.*?).*?(.*?).*?title="(.*?)">.*?title="(.*?)">.*?class=""> (.*?)\n', re.S)  # comments etc.

html = requests.get(url_first, headers=head, cookies=cookies)
while html.status_code == 200:
    zhanlang = re.findall(reg, html.text)  # BUG FIX: original used undefined name `ren`
    if not zhanlang:
        # BUG FIX: original indexed [0] unconditionally and crashed with
        # IndexError on the first page that yields no matches.
        break
    # Presumably the first capture is the next-page query string -- but with
    # a multi-group pattern findall() returns tuples, so this concatenation
    # only works once the original single-group pattern is restored. TODO confirm.
    url_next = 'https://movie.douban.com/subject/26363254/comments' + zhanlang[0]
    data = pd.DataFrame(zhanlang)
    # Write to the CSV file; 'a+' is append mode so each page accumulates.
    data.to_csv('/home/wajuejiprince/文檔/zhanlang/zhanlangpinglun.csv',
                header=False, index=False, mode='a+')
    html = requests.get(url_next, cookies=cookies, headers=head)
library(data.table)
library(plotly)
library(stringr)
library(jiebaR)
library(wordcloud2)
library(magrittr)
# Load the scraped comment CSV into a data.table (file chosen interactively).
dt<-fread(file.choose()) # import the data
# Drop the trailing empty columns (V8..V13) in place via data.table's `:=`.
dt[,c("V8","V9","V10","V11","V12","V13"):=NULL] # delete the empty columns
# Clean the data with one chained data.table expression: keep rows whose
# upvote count (贊成評論數(shù)) contains digits, whose "useful" flag (評論有用) is
# "有用", whose "watched" flag (是否看過) is "看過", and whose star rating
# (五星數(shù)) is one of the five recognised labels.
my_dt<-dt[str_detect(贊成評論數(shù),"\\d+")][評論有用=='有用'][是否看過=="看過"][五星數(shù)%in%c("很差","較差","還行","推薦","力薦")]
# Tokenize every comment with a default jiebaR segmenter.
wk <- worker()
# IDIOM: call the documented segment() function instead of the opaque `<=`
# operator overload (jiebaR defines `wk <= x` as an alias for segment(x, wk)).
sw <- function(x) {
  segment(x, wk)
}
# Segment each comment (評論內(nèi)容 column), yielding a list of token vectors.
segwords <- lapply(my_dt[, 評論內(nèi)容], sw)
my_segwords <- unlist(segwords) # flatten to one character vector -- no list
# Remove Chinese stop words from the segmented tokens.
st <- readLines(file.choose()) # read the stop-word file (chosen interactively)
# FIX: the original copied `st` into `stopwords` element by element, growing
# the vector inside a `1:length(st)` loop. readLines() already returns a
# character vector, so a direct assignment is exactly equivalent.
stopwords <- st
seg_Words <- filter_segment(my_segwords, stopwords) # drop Chinese stop words
# Tabulate token frequencies and render a word cloud of the frequent tokens.
words <- table(seg_Words) %>% data.table()
setnames(words, "N", "pinshu") # "pinshu" = frequency count
# FIX: the original computed `words[pinshu>1000]` without assigning it, so the
# "filtering" line had no lasting effect; assign the subset once and reuse it.
# Tokens with frequency <= 1000 are dropped to keep the plot light.
high_freq_words <- words[pinshu > 1000]
wordcloud2(high_freq_words, size = 2, fontFamily = "微軟雅黑",
           color = "random-light", backgroundColor = "grey")
由于數(shù)據(jù)太多,我的破電腦頗為卡頓,所以在制作云圖時去掉了頻數(shù)低于1000的詞匯。