# Full code listing
import jieba
import jieba.posseg as psg
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
import bs4
# Crawl Sina Sports live text commentary for a range of match ids, tag the
# names found in every commentary line, and save one Excel file per match.

FIRST_MATCH_ID = 2188200
LAST_MATCH_ID = 2188250  # inclusive
LIVECAST_URL = 'https://match.sports.sina.com.cn/livecast/9/iframe/scroll_iframe.php?opta_id='
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Connection': 'keep-alive',
}
# Pages with this many <tr> rows or fewer are skipped.
MAX_ROWS_OF_SKIPPED_PAGE = 5
# Captures the two team names around a "d-d" score in the first commentary line.
TEAM_PATTERN = re.compile(r',(.*?) \d-\d (.*?),')
# jieba part-of-speech flags whose tokens get wrapped in [...]
# (person name, transliterated person name, English word in jieba's tag set).
NAME_FLAGS = ('nr', 'nrt', 'eng')
# Words carrying one of the flags above that must not be tagged.
IGNORED_WORDS = ('谢谢', '换人')


def _fetch_rows(match_id):
    """Download the commentary page of *match_id* and return its <tr> elements."""
    response = requests.get(
        LIVECAST_URL + str(match_id), headers=REQUEST_HEADERS, timeout=30
    )
    soup = BeautifulSoup(response.content.decode('utf-8'), 'lxml')
    return soup.find_all('tr')


def _split_rows(rows):
    """Return two parallel lists: the <th> text and first <td> text of each row.

    Rows lacking either cell are dropped so both lists always have the same
    length (the DataFrame built from them requires that).
    """
    times = []
    texts = []
    for row in rows:
        time_cell = row.find('th')
        text_cell = row.find('td')
        if time_cell is None or text_cell is None:
            continue
        times.append(time_cell.get_text())
        texts.append(text_cell.get_text())
    return times, texts


def _tag_names(text, teams):
    """Return *text* with name-like tokens wrapped in [] and team names in <>.

    *teams* is the list of the two team-name strings.  Tokens that are a
    substring of a team name are left alone so a team name split up by the
    tokenizer is not bracketed piecewise.
    """
    tagged = text
    already_tagged = set()
    for token in psg.cut(text):
        word = token.word
        if any(word in team for team in teams):
            continue
        if token.flag not in NAME_FLAGS or word in IGNORED_WORDS:
            continue
        # str.replace rewrites every occurrence at once; replacing again for a
        # repeated token would nest the brackets ("[[name]]").
        if word in already_tagged:
            continue
        already_tagged.add(word)
        tagged = tagged.replace(word, '[' + word + ']')
    for team in teams:
        tagged = tagged.replace(team, '<' + team + '>')
    return tagged


for match_id in range(FIRST_MATCH_ID, LAST_MATCH_ID + 1):
    try:
        rows = _fetch_rows(match_id)
    except requests.RequestException as err:
        # One unreachable page must not abort the whole crawl.
        print('request failed, skipping', match_id, err)
        continue
    if len(rows) <= MAX_ROWS_OF_SKIPPED_PAGE:
        continue

    times, texts = _split_rows(rows)
    if not texts:
        continue

    # The team names are read from the first commentary line.
    found = TEAM_PATTERN.findall(texts[0])
    if not found:
        print('team names not found, skipping', match_id)
        continue
    teams = list(found[0])

    tagged_texts = [_tag_names(text, teams) for text in texts]

    # Store the result.
    df = pd.DataFrame({'时间': times,
                       '内容': texts,
                       '分词后内容': tagged_texts})
    # Reverse the row order of the scraped table.
    df = df.reindex(index=df.index[::-1])
    # Remove duplicated (time, content) rows.
    df.drop_duplicates(['时间', '内容'], inplace=True)
    df.to_excel(str(match_id) + '.xlsx')
    # Progress output: as before, this prints the id that is fetched next.
    print(match_id + 1)