import re
import urllib.request, urllib.error

import requests
from bs4 import BeautifulSoup
import xlwt
def getData(url):
    """Fetch one Top 250 page and parse every film entry into a list of fields."""
    # Browser-like User-Agent so Douban does not reject the request outright.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                      " (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }
    html = ""
    try:
        # Route the request through a local proxy client (ports are machine-specific).
        proxy = {'http': 'http://127.0.0.1:10809', 'https': 'http://127.0.0.1:10809'}
        req = requests.get(url=url, headers=header, proxies=proxy)
        html = req.text
        bs = BeautifulSoup(html, "html.parser")
        items = bs.find_all("div", class_="item")  # one <div class="item"> per film
        # Field patterns; re.S lets "." match across newlines inside an item block.
        findLink = re.compile(r'<a href="(.*)">')
        findTitle = re.compile(r'<span class="title">(.*?)</span>', re.S)
        findOther = re.compile(r'<span class="other">(.*?)</span>', re.S)
        findSrc = re.compile(r'<img.*?src="(.*?)"', re.S)
        findMaster = re.compile(r'<p class="">(.*?)</p>', re.S)
        findAvg = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
        findPersons = re.compile(r'<span>(\d*)人评价')  # "N people rated"
        findDetail = re.compile(r'<span class="inq">(.*?)</span>')
        '''
        The greedy variant overshoots into later attributes:
        findSrc = re.compile(r'<img.*src="(.*)"')
        ['https://img2.doubanio.com/view/photo/s_ratio_poster/public/p480747492.jpg" width="100']
        '''
        dataList = []
        for item in items:
            item = str(item)
            data = []
            # A film normally has two titles: Chinese first, then the foreign one.
            title = re.findall(findTitle, item)
            if len(title) == 2:
                data.append(title[0])                                         # Chinese title
                data.append(title[1].replace(u'\xa0', u'').replace("/", ""))  # foreign title
            else:
                data.append(title[0])
                data.append(" ")
            # "Other" names: drop non-breaking spaces, turn slashes into dashes.
            data.append(re.findall(findOther, item)[0]
                        .replace(u'\xa0', u'').replace("/", '-').replace(' ', ''))
            data.append(re.findall(findLink, item)[0])   # detail-page link
            data.append(re.findall(findSrc, item)[0])    # poster image link
            # Director/cast paragraph, with markup and whitespace stripped.
            data.append(re.findall(findMaster, item)[0]
                        .replace("...<br/>", "").replace("\n", "")
                        .replace(" ", "").replace("\xa0", ""))
            data.append(re.findall(findAvg, item)[0])      # rating
            data.append(re.findall(findPersons, item)[0])  # number of ratings
            # Not every film has a one-line quote.
            detail = re.findall(findDetail, item)
            data.append(detail[0].replace("。", "") if detail else "暂无评论")  # "no quote yet"
            dataList.append(data)
        return dataList
    except requests.exceptions.RequestException as e:
        # requests raises its own exceptions, not urllib.error.URLError.
        print(e)
        return []  # keep the return type consistent for getAll()
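# Why the non-greedy patterns matter, shown on a minimal hypothetical fragment
# (not real Douban markup): a greedy ".*" backtracks to the LAST '"' on the
# line, which is exactly the overshoot recorded in the commented-out findSrc
# note inside getData above.
#
#   >>> frag = '<img alt="x" src="p1.jpg" width="100"/>'
#   >>> re.findall(r'<img.*?src="(.*?)"', frag)
#   ['p1.jpg']
#   >>> re.findall(r'<img.*src="(.*)"', frag)
#   ['p1.jpg" width="100']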
def getAll(url):
    savepath = "豆瓣电影Top250.xls"
    excel = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = excel.add_sheet("豆瓣电影top250", cell_overwrite_ok=True)
    # Headers: name, alt title 1, alt title 2, film link, image link,
    # cast, rating, number of ratings, one-line quote.
    col = ("电影名", "别名1", "别名2", "电影链接", "图片链接",
           "演员表", "评分", "评论人数", "短评语")
    for i in range(0, 9):
        sheet.write(0, i, col[i])
    # Douban pages the Top 250 in steps of 25: start=0, 25, ..., 225.
    rows = []
    for i in range(0, 10):
        rows.extend(getData(url + str(i * 25)))
    print(len(rows))
    for r, data in enumerate(rows):
        for c, value in enumerate(data):
            sheet.write(r + 1, c, value)
    excel.save(savepath)
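# Minimal sanity check for the saved workbook; a sketch, not part of the
# crawl. Assumes the xlrd package is installed (xlrd still reads .xls files):
#
#   import xlrd
#   book = xlrd.open_workbook("豆瓣电影Top250.xls")
#   sheet = book.sheet_by_index(0)
#   print(sheet.nrows - 1)        # expect 250 data rows below the header
#   print(sheet.row_values(1))    # first film: titles, links, rating, ...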
def getHtml(url):
    # urllib-based fallback fetcher (not used by main); goes through a local proxy.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                      " (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }
    proxy_handler = urllib.request.ProxyHandler({'http': '127.0.0.1:10810'})
    opener = urllib.request.build_opener(proxy_handler)
    # opener.open() has no headers parameter; headers go on the Request object.
    req = urllib.request.Request(url=url, headers=header)
    r = opener.open(req)
    html = r.read().decode('utf-8', 'ignore')
    return html
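# Usage sketch for the fallback fetcher. Note it points at port 10810 while
# getData uses 10809; both ports are assumptions about a local proxy client
# and may need adjusting:
#
#   html = getHtml("https://movie.douban.com/top250?start=0")
#   print(html[:300])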
def main():
    url = "https://movie.douban.com/top250?start="
    # Leftover anti-crawler redirect URL, kept for reference; not used by the crawl.
    # url2 = 'https://sec.douban.com/a?c=a6b706&d="+d+"&r=https%3A%2F%2Fmovie.douban.com%2Ftop250%3Fstart%3D0&k=a6wmDk9GyuEKktSqamFs3h3441rJCo%2F2%2FPK%2FLTqRk0k'
    getAll(url)
if __name__ == "__main__":
    main()