This post is mainly the hands-on practice part.

Douban Top 250 Ranking

The main goal is to crawl the Douban Top 250 and save the results to an Excel file.

import requests
from bs4 import BeautifulSoup
import pandas as pd

# the list pages start at 0, 25, 50, ..., 225 (10 pages in total)
PageIndex = range(0, 250, 25)


def DownloadAllHtmls() -> list:
    """
    Download the HTML of all 10 list pages.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer': 'https://wangzhilei.fun'
    }

    htmls = []
    for idx in PageIndex:
        url = f"https://movie.douban.com/top250?start={idx}&filter="
        # print(f"craw html: {url}")
        r = requests.get(url, headers=headers, timeout=3)
        if r.status_code != 200:
            raise Exception(f"request failed: {url}")
        htmls.append(r.text)

    return htmls


def ParseSingleHtml(html):
    """
    Parse a single list page and extract the data.
    :return: list of dicts with rank, title, rating_star, rating_num, comments
    """
    soup = BeautifulSoup(html, 'html.parser')
    ArticleItems = (soup.find('div', class_='article')
                        .find('ol', class_='grid_view')
                        .find_all('div', class_='item'))
    datas = []
    for ArticleItem in ArticleItems:
        rank = ArticleItem.find("div", class_='pic').find("em", class_="").get_text()
        info = ArticleItem.find('div', class_='info')
        title = info.find('div', class_='hd').find('span', class_='title').get_text()
        stars = info.find('div', class_='bd').find('div', class_='star').find_all('span')
        rating_star = stars[0]["class"][0]   # e.g. "rating5-t"
        rating_num = stars[1].get_text()
        comments = stars[3].get_text()
        datas.append({
            "rank": rank,
            "title": title,
            "rating_star": rating_star.replace("rating", "").replace("-t", ""),
            "rating_num": rating_num,
            "comments": comments.replace("人评价", '')
        })
    return datas


htmls = DownloadAllHtmls()

AllDatas = []
for html in htmls:
    AllDatas.extend(ParseSingleHtml(html))
df = pd.DataFrame(AllDatas)
df.to_excel("豆瓣电影top250.xlsx")

This produces 豆瓣电影top250.xlsx.
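
As a quick sanity check (not part of the original script), the file can be read back with pandas. A minimal sketch, assuming openpyxl (the default xlsx engine in recent pandas) is installed:

import pandas as pd

# read the workbook back; the first column is the DataFrame index written by to_excel
check_df = pd.read_excel("豆瓣电影top250.xlsx", index_col=0)
print(len(check_df))      # should be 250 if every page parsed cleanly
print(check_df.head())    # spot-check rank, title, rating_star, rating_num, comments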

Scraping Historical Weather Data for Yuelu District

Techniques involved:

  1. Setting a User-Agent in the headers to get past the basic anti-scraping check
  2. Capturing traffic in the browser's Network tab to analyze the AJAX request and its parameters (see the probe sketch below)
  3. Requesting the data for different parameter values with a for loop
  4. Merging the results and saving them to Excel with pandas
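
For step 2, the request found in the Network tab can be replayed on its own to confirm the response shape before writing the full loop. A minimal probe sketch: the endpoint and parameter names come from the full script below; the trimmed Cookie value is an assumption, the complete header set is in the script.

import requests

probe = requests.get(
    "https://tianqi.2345.com/Pc/GetHistory",
    params={
        "areaInfo[areaId]": 71952,   # Yuelu district
        "areaInfo[areaType]": 2,
        "date[year]": 2023,
        "date[month]": 1,
    },
    headers={
        # per the note in the script below, the Cookie and Referer keys are what matter
        "Cookie": "positionCityID=71952; positionCityPinyin=yuelu",  # trimmed; full value in the script
        "Referer": "https://tianqi.2345.com/wea_history/71952.htm",
    },
    timeout=3,
)
payload = probe.json()
print(payload.keys())          # the month's table sits under the "data" key
print(payload["data"][:200])   # peek at the start of the HTML <table> fragment

The full script:
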
from io import StringIO
import requests
import pandas as pd


def craw_table(year, month):
    """
    Fetch the weather table for a given year and month.
    :param year:
    :param month:
    :return: DataFrame for that month
    """
    params = {
        "areaInfo[areaId]": 71952,
        "areaInfo[areaType]": 2,
        "date[year]": year,
        "date[month]": month,
    }

    response = requests.get(URL, headers=headers, params=params, timeout=3)
    data = response.json()["data"]
    # wrap the HTML string in StringIO before handing it to pandas
    df = pd.read_html(StringIO(data))[0]
    return df


URL = "https://tianqi.2345.com/Pc/GetHistory"


# it turned out that only the "Cookie" and "Referer" keys are actually required
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    "Cookie": "positionCityID=71952; positionCityPinyin=yuelu; Hm_lvt_a3f2879f6b3620a363bec646b7a8bcdd=1719578091; lastCountyId=71952; lastCountyPinyin=yuelu; lastProvinceId=23; lastCityId=57687; Hm_lpvt_a3f2879f6b3620a363bec646b7a8bcdd=1719578124; lastCountyTime=1719578124",
    "Referer": "https://tianqi.2345.com/wea_history/71952.htm",
}

df_list = []
for used_year in range(2020, 2024):
    for used_month in range(1, 13):
        print(f"{used_year} {used_month}")
        needed_df = craw_table(used_year, used_month)
        df_list.append(needed_df)
pd.concat(df_list).to_excel("岳麓区历史天气数据.xlsx", index=False)

This produces 岳麓区历史天气数据.xlsx.

Scraping a Novel

"""
爬取小说
"""

import requests
from bs4 import BeautifulSoup



def get_novel_chapters() ->list:
global novel_name
root_url = input()
r = requests.get(root_url,timeout=3)
r.encoding = "gbk"
novel_soup = BeautifulSoup(r.text,"html.parser")
chapters = []
novel_name = novel_soup.find("div",id="info").find('h1').get_text()
for dd in novel_soup.find_all("dd"):
chapters_link = dd.find("a")
if not chapters_link:
continue
chapters.append((f"{root_url}{chapters_link['href'][10:]}", chapters_link.text))
return chapters


def get_chapter_content(url):
r = requests.get(url,timeout=3)
r.encoding = 'gbk'
chapter_soup = BeautifulSoup(r.text, "html.parser")
chapter_content = chapter_soup.find("div", id="content").get_text()
chapter_content = chapter_content.replace('\xa0', ' ')
return chapter_content




novel_chapters = get_novel_chapters()
for chapter in novel_chapters[9:]:
chapter_url, chapter_title = chapter
with open(f"{novel_name}.txt", "w") as fout:
fout.write(chapter_title)
fout.write("\n")
fout.write(get_chapter_content(chapter_url))

While batch-crawling the chapters, my IP address seems to have been blacklisted; the site won't even open anymore...
The code itself should be fine, though.
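
One common way to make batch crawling less likely to trip a ban is to pace the requests and retry politely. A standard-library sketch (fetch_politely is just an illustrative helper name, not part of the original script):

import time
import requests


def fetch_politely(url, headers=None, retries=3, delay=2.0):
    """GET with a pause between attempts, so requests are not fired back-to-back."""
    for attempt in range(retries):
        try:
            r = requests.get(url, headers=headers, timeout=5)
            if r.status_code == 200:
                return r
        except requests.RequestException:
            pass
        time.sleep(delay * (attempt + 1))  # back off a little more on every retry
    raise RuntimeError(f"giving up on {url}")


# usage sketch: pause between chapters instead of hammering the site
# for chapter_url, chapter_title in novel_chapters[9:]:
#     r = fetch_politely(chapter_url)
#     time.sleep(1.0)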