将爬取的数据存储至Excel表
# -*- coding: utf-8 -*-
# @Time: 2021/7/20 23:40
# @Author: lmb
# @File: douban_top250.py
"""Crawl the Douban TOP250 movie list and store names/links in an Excel sheet."""

import re
import urllib.error
import urllib.request

import xlwt
from bs4 import BeautifulSoup

# Extraction patterns at module level so GetUrl also works when this file is
# imported (they were previously defined only under __main__ -> NameError).
findLink = re.compile(r'<a href="(.*?)">')                   # movie detail-page URL
findName = re.compile(r'<span class="title">(.*?)</span>')   # movie title


def AskUrl(url):
    """Fetch one page and return its HTML as a str ("" on failure)."""
    # Spoof a browser User-Agent so Douban does not reject the request.
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4542.2 Safari/537.36"}
    reqs = urllib.request.Request(url, headers=head)
    html = ""
    try:
        resposed = urllib.request.urlopen(reqs)
        html = resposed.read().decode("utf-8")
    except urllib.error.URLError as e:
        # Best-effort diagnostics; still return "" so the caller can
        # continue with the remaining pages.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)  # fixed: was e.reson (AttributeError on any URLError)
    return html


def GetUrl(baseurl):
    """Crawl all 10 result pages (25 movies each).

    Returns a list of [name, link] pairs, one per movie.
    """
    datalist = []
    for i in range(0, 10):
        url = baseurl + str(i * 25)   # ?start=0, 25, 50, ...
        html = AskUrl(url)

        # Parse the page and extract one row per movie entry.
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            item = str(item)
            data = []                            # one movie's [name, link]
            name = re.findall(findName, item)[0]
            data.append(name)
            link = re.findall(findLink, item)[0]
            data.append(link)
            datalist.append(data)
    return datalist


def savaData(datelist, savepath):
    """Write the crawled rows to an .xls workbook at *savepath*."""
    print("save...")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('豆瓣TOP250', cell_overwrite_ok=True)
    col = ("电影名称", "电影链接")  # header row: movie name, movie link
    for i in range(len(col)):
        sheet.write(0, i, col[i])
    # Iterate the rows actually crawled instead of a hard-coded 250: the old
    # range(0, 250) raised IndexError on a short crawl and, because
    # book.save() sat inside the same try/except-pass, silently discarded
    # ALL data instead of saving what was collected.
    for i, date in enumerate(datelist):
        print("第%d条" % (i + 1))
        for j in range(len(col)):
            # Row i+1 because row 0 holds the column headers.
            sheet.write(i + 1, j, date[j])
    book.save(savepath)  # fixed: honor the savepath argument (was hard-coded)


if __name__ == '__main__':
    datalist = GetUrl("https://movie.douban.com/top250?start=")
    savapath = "豆瓣TOP250.xls"
    savaData(datalist, savapath)