爬虫抓取新浪足球文字直播

# 全部代码
import jieba
import jieba.posseg as psg
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
import bs4

# Crawl Sina Sports football text-live pages for a range of match IDs and save
# each match's commentary, plus an entity-tagged copy, to "<match_id>.xlsx".

# Inclusive range of "opta_id" match identifiers to crawl.
FIRST_MATCH_ID = 2188200
LAST_MATCH_ID = 2188250

# A page with this many <tr> rows or fewer is treated as having no live text.
MAX_EMPTY_ROWS = 5

LIVE_URL = 'https://match.sports.sina.com.cn/livecast/9/iframe/scroll_iframe.php?opta_id='

REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Connection': 'keep-alive',
}

# Captures the two team names on either side of a single-digit score,
# e.g. ",TeamA 2-1 TeamB,".
TEAM_PATTERN = re.compile(r',(.*?) \d-\d (.*?),')

# jieba part-of-speech flags that get bracketed as names.
NAME_FLAGS = frozenset(('nr', 'nrt', 'eng'))

# Words jieba tends to tag with a name flag that must not be bracketed.
IGNORED_WORDS = frozenset(('谢谢', '换人'))


def extract_team_names(first_line):
    """Return the ``(home, away)`` team names found in *first_line*.

    Returns ``None`` when the "<team> <d>-<d> <team>" pattern is absent or
    either captured name is empty. An empty name would make the later
    ``str.replace('', ...)`` insert markers between every character.
    """
    found = TEAM_PATTERN.search(first_line)
    if found is None:
        return None
    home, away = found.groups()
    if not home or not away:
        return None
    return home, away


def collect_rows(rows):
    """Split table rows into parallel lists of time texts and content texts.

    A row lacking either a ``<th>`` or a ``<td>`` cell is skipped entirely so
    the two returned lists always have the same length.
    """
    times = []
    contents = []
    for row in rows:
        time_cell = row.find('th')
        text_cell = row.find('td')
        if time_cell is None or text_cell is None:
            continue
        times.append(time_cell.get_text())
        contents.append(text_cell.get_text())
    return times, contents


def tag_entities(text, tokens, teams):
    """Return *text* with names wrapped in ``[...]`` and team names in ``<...>``.

    *tokens* is an iterable of ``(word, flag)`` pairs, as produced by
    ``jieba.posseg.cut``. *teams* is the ``(home, away)`` pair of team names.
    """
    home, away = teams
    wrapped = set()
    for word, flag in tokens:
        # Fragments of a team name are left alone so the team name survives
        # intact for the final <...> replacement.
        if word in home or word in away:
            continue
        if flag not in NAME_FLAGS or word in IGNORED_WORDS:
            continue
        # str.replace already rewrites every occurrence; wrapping the same
        # word again would produce nested brackets such as "[[name]]".
        if word in wrapped:
            continue
        wrapped.add(word)
        text = text.replace(word, '[' + word + ']')
    return text.replace(home, '<' + home + '>').replace(away, '<' + away + '>')


def crawl_match(match_id):
    """Download one match page and write ``<match_id>.xlsx``.

    Returns ``True`` when a workbook was written, ``False`` when the page was
    skipped (too few rows, no usable rows, or no recognisable team names).
    """
    response = requests.get(LIVE_URL + str(match_id),
                            headers=REQUEST_HEADERS, timeout=30)
    soup = BeautifulSoup(response.content.decode('utf-8'), 'lxml')

    rows = soup.find_all('tr')
    if len(rows) <= MAX_EMPTY_ROWS:
        return False

    times, contents = collect_rows(rows)
    if not contents:
        return False

    # The team names are read from the first commentary line.
    teams = extract_team_names(contents[0])
    if teams is None:
        print('skip', match_id, '- team names not found')
        return False

    tagged = [tag_entities(line, psg.cut(line), teams) for line in contents]

    df = pd.DataFrame({'时间': times,
                       '内容': contents,
                       '分词后内容': tagged})
    # Reverse the row order before dropping duplicates.
    df = df.reindex(index=df.index[::-1])
    df.drop_duplicates(subset=['时间', '内容'], inplace=True)
    df.to_excel(str(match_id) + '.xlsx')
    return True


def main():
    """Crawl every match ID in the configured inclusive range."""
    for match_id in range(FIRST_MATCH_ID, LAST_MATCH_ID + 1):
        if crawl_match(match_id):
            # Kept from the original script: after a successful save it
            # reports the next ID to be fetched.
            print(match_id + 1)


if __name__ == '__main__':
    main()

爬虫抓取新浪足球文字直播

相关文章

两种骨架提取的方法（color.rgb2gray和CV2）

2020金属非金属矿山安全检查（露天矿山）考试题库及金属非金属矿山安全检查（露天矿山）证考试

2020机修钳工（高级）考试题库及机修钳工（高级）复审模拟考试

pyinstaller打包踩坑报错PermissionError: [Errno 13] Permission denied....

2020硝化工艺考试及硝化工艺模拟考试

使用matplotlib 绘制动图动画 FuncAnimation 解决jupyter中动图不动

2020年金属非金属矿山提升机操作多少钱及金属非金属矿山提升机操作复审考试

python输入坐标返回形成的多边形，并输出面积和动画