This post is mainly hands-on practice.
Douban Rankings
The goal is to scrape the Douban Top 250 movie list and save it to an Excel file.
```python
import requests
from bs4 import BeautifulSoup
import pandas as pd

# The Top 250 list spans 10 pages, 25 movies per page
PageIndex = range(0, 250, 25)


def DownloadAllHtmls() -> list:
    """Download the HTML of all 10 list pages."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer': 'https://wangzhilei.fun'
    }
    htmls = []
    for idx in PageIndex:
        url = f"https://movie.douban.com/top250?start={idx}&filter="
        r = requests.get(url, headers=headers, timeout=3)
        if r.status_code != 200:
            raise Exception(f"request failed: {url}")
        htmls.append(r.text)
    return htmls


def ParseSingleHtml(html):
    """Parse one list page into a list of dicts: rank, title, rating, comment count."""
    soup = BeautifulSoup(html, 'html.parser')
    ArticleItems = (soup.find('div', class_='article')
                        .find('ol', class_='grid_view')
                        .find_all('div', class_='item'))
    datas = []
    for ArticleItem in ArticleItems:
        rank = ArticleItem.find('div', class_='pic').find('em', class_='').get_text()
        info = ArticleItem.find('div', class_='info')
        title = info.find('div', class_='hd').find('span', class_='title').get_text()
        stars = (info.find('div', class_='bd')
                     .find('div', class_='star')
                     .find_all('span'))
        rating_star = stars[0]['class'][0]   # e.g. "rating5-t"
        rating_num = stars[1].get_text()     # e.g. "9.7"
        comments = stars[3].get_text()       # e.g. "123456人评价"
        datas.append({
            'rank': rank,
            'title': title,
            'rating_star': rating_star.replace('rating', '').replace('-t', ''),
            'rating_num': rating_num,
            'comments': comments.replace('人评价', ''),
        })
    return datas


htmls = DownloadAllHtmls()
AllDatas = []
for html in htmls:
    AllDatas.extend(ParseSingleHtml(html))
df = pd.DataFrame(AllDatas)
df.to_excel("豆瓣电影top250.xlsx")
```
This produces 豆瓣电影top250.xlsx.
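As a quick sanity check, the saved file can be read back with pandas. The column names below come straight from the dicts built in ParseSingleHtml; the numeric conversions and the sort are just a suggested follow-up, not part of the original script.

```python
import pandas as pd

# Read the Excel file written by the scraper back into a DataFrame
df = pd.read_excel("豆瓣电影top250.xlsx")

# Columns from ParseSingleHtml: rank, title, rating_star, rating_num, comments.
# They are stored as text, so convert the numeric ones before analysing.
df["rating_num"] = df["rating_num"].astype(float)
df["comments"] = df["comments"].astype(int)

# e.g. the ten most-reviewed movies in the Top 250
print(df.sort_values("comments", ascending=False)
        .head(10)[["rank", "title", "comments"]])
```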
Scraping Historical Weather for Yuelu District
Techniques involved:

- Setting a User-Agent in the request headers to get past the site's basic anti-scraping check
- Capturing traffic in the browser's Network panel to work out the Ajax endpoint and its parameters (a quick probe of the endpoint is sketched after this list)
- Looping over the year/month parameters to request each month's data
- Using pandas to merge the monthly tables and save them to Excel
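Before wiring up the full loop, it helps to hit the captured Ajax endpoint once and look at what comes back. The sketch below reuses the URL and parameters found via the Network panel (the same ones used in the full script below); the shortened User-Agent may or may not be enough for this site, so treat it as a rough probe only.

```python
import requests

# One-off probe of the Ajax endpoint found in the browser's Network panel.
# The "data" field of the JSON response is an HTML <table> fragment,
# which is why the full script feeds it to pandas.read_html().
url = "https://tianqi.2345.com/Pc/GetHistory"
params = {
    "areaInfo[areaId]": 71952,   # Yuelu District
    "areaInfo[areaType]": 2,
    "date[year]": 2023,
    "date[month]": 6,
}
headers = {"User-Agent": "Mozilla/5.0"}

resp = requests.get(url, headers=headers, params=params, timeout=3)
print(resp.json()["data"][:300])   # peek at the returned table markup
```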
```python
from io import StringIO

import requests
import pandas as pd

URL = "https://tianqi.2345.com/Pc/GetHistory"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    "Cookie": "positionCityID=71952; positionCityPinyin=yuelu; Hm_lvt_a3f2879f6b3620a363bec646b7a8bcdd=1719578091; lastCountyId=71952; lastCountyPinyin=yuelu; lastProvinceId=23; lastCityId=57687; Hm_lpvt_a3f2879f6b3620a363bec646b7a8bcdd=1719578124; lastCountyTime=1719578124",
    "Referer": "https://tianqi.2345.com/wea_history/71952.htm",
}


def craw_table(year, month):
    """Fetch one month's weather history and return it as a DataFrame."""
    params = {
        "areaInfo[areaId]": 71952,   # Yuelu District
        "areaInfo[areaType]": 2,
        "date[year]": year,
        "date[month]": month,
    }
    response = requests.get(URL, headers=headers, params=params, timeout=3)
    # The Ajax response is JSON whose "data" field is an HTML table
    data = response.json()["data"]
    df = pd.read_html(StringIO(data))[0]
    return df


df_list = []
for used_year in range(2020, 2024):
    for used_month in range(1, 13):
        print(f"{used_year} {used_month}")
        needed_df = craw_table(used_year, used_month)
        df_list.append(needed_df)

pd.concat(df_list).to_excel("岳麓区历史天气数据.xlsx", index=False)
```
This produces 岳麓区历史天气数据.xlsx.
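A quick way to check that the merge worked is to read the combined file back and count rows (48 months of roughly 28 to 31 days each). The column names mentioned in the comment are an assumption about what the 2345 history table contains and may differ from the actual export.

```python
import pandas as pd

# Load the merged weather table written by the scraper
weather = pd.read_excel("岳麓区历史天气数据.xlsx")

# 48 months were requested, so expect roughly 48 * 30 rows
print(weather.shape)

# Peek at the first rows to confirm the columns
# (likely something like 日期 / 最高温 / 最低温 / 天气 / 风力风向 -- assumption)
print(weather.head())
```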
Scraping a Novel
| """ 爬取小说 """
import requests from bs4 import BeautifulSoup
def get_novel_chapters() ->list: global novel_name root_url = input() r = requests.get(root_url,timeout=3) r.encoding = "gbk" novel_soup = BeautifulSoup(r.text,"html.parser") chapters = [] novel_name = novel_soup.find("div",id="info").find('h1').get_text() for dd in novel_soup.find_all("dd"): chapters_link = dd.find("a") if not chapters_link: continue chapters.append((f"{root_url}{chapters_link['href'][10:]}", chapters_link.text)) return chapters
def get_chapter_content(url): r = requests.get(url,timeout=3) r.encoding = 'gbk' chapter_soup = BeautifulSoup(r.text, "html.parser") chapter_content = chapter_soup.find("div", id="content").get_text() chapter_content = chapter_content.replace('\xa0', ' ') return chapter_content
novel_chapters = get_novel_chapters() for chapter in novel_chapters[9:]: chapter_url, chapter_title = chapter with open(f"{novel_name}.txt", "w") as fout: fout.write(chapter_title) fout.write("\n") fout.write(get_chapter_content(chapter_url))
|
While running the batch scrape, my IP address seems to have gotten blacklisted; the site won't even open anymore...
But the code itself should be fine.
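One way to make a batch scrape less likely to get an IP blocked is to pause between requests and back off after failures. The following is only a sketch, not part of the original script: a hypothetical wrapper around the get_chapter_content function defined above, using time.sleep and a simple retry loop.

```python
import time

import requests

# Hypothetical wrapper around the existing get_chapter_content():
# retry a few times with a growing delay instead of failing immediately.
def get_chapter_content_politely(url, retries=3, delay=2.0):
    for attempt in range(retries):
        try:
            return get_chapter_content(url)        # defined in the script above
        except requests.RequestException:
            # back off a little more after each failed attempt
            time.sleep(delay * (attempt + 1))
    raise RuntimeError(f"giving up on {url}")


# In the main loop, sleep between chapters instead of hammering the site:
#     fout.write(get_chapter_content_politely(chapter_url))
#     time.sleep(1.0)
```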