This post is mainly the hands-on practice part.

Douban Top 250 Ranking

The main goal is to crawl the Douban Top 250 and save the results to an Excel file.

import requests
from bs4 import BeautifulSoup
import pandas as pd

# the list pages start at 0, 25, 50, ..., 225 (10 pages in total)
PageIndex = range(0, 250, 25)


def DownloadAllHtmls() -> list:
    """
    Download the HTML of all 10 list pages.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer': 'https://wangzhilei.fun'
    }

    htmls = []
    for idx in PageIndex:
        url = f"https://movie.douban.com/top250?start={idx}&filter="
        # print(f"craw html: {url}")
        r = requests.get(url, headers=headers, timeout=3)
        if r.status_code != 200:
            raise Exception(f"request failed: {url}")
        htmls.append(r.text)

    return htmls


def ParseSingleHtml(html):
    """
    Parse a single list page and extract the data.
    :return: list of dicts with rank, title, rating_star, rating_num, comments
    """
    soup = BeautifulSoup(html, 'html.parser')
    ArticleItems = (soup.find('div', class_='article')
                        .find('ol', class_='grid_view')
                        .find_all('div', class_='item'))
    datas = []
    for ArticleItem in ArticleItems:
        rank = ArticleItem.find("div", class_='pic').find("em", class_="").get_text()
        info = ArticleItem.find('div', class_='info')
        title = info.find('div', class_='hd').find('span', class_='title').get_text()
        stars = info.find('div', class_='bd').find('div', class_='star').find_all('span')
        rating_star = stars[0]["class"][0]   # e.g. "rating5-t"
        rating_num = stars[1].get_text()
        comments = stars[3].get_text()
        datas.append({
            "rank": rank,
            "title": title,
            "rating_star": rating_star.replace("rating", "").replace("-t", ""),
            "rating_num": rating_num,
            "comments": comments.replace("人评价", '')
        })
    return datas


htmls = DownloadAllHtmls()

AllDatas = []
for html in htmls:
    AllDatas.extend(ParseSingleHtml(html))
df = pd.DataFrame(AllDatas)
df.to_excel("豆瓣电影top250.xlsx")

This produces 豆瓣电影top250.xlsx.
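
As a quick sanity check (not part of the original script), the file can be read back with pandas. A minimal sketch, assuming openpyxl (the default xlsx engine in recent pandas) is installed:

import pandas as pd

# read the workbook back; the first column is the DataFrame index written by to_excel
check_df = pd.read_excel("豆瓣电影top250.xlsx", index_col=0)
print(len(check_df))      # should be 250 if every page parsed cleanly
print(check_df.head())    # spot-check rank, title, rating_star, rating_num, comments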

Scraping Historical Weather Data for Yuelu District

Techniques involved:

  1. Setting a User-Agent in the headers to get past the basic anti-scraping check
  2. Capturing traffic in the browser's Network tab to analyze the AJAX request and its parameters (see the probe sketch below)
  3. Requesting the data for different parameter values with a for loop
  4. Merging the results and saving them to Excel with pandas
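
For step 2, the request found in the Network tab can be replayed on its own to confirm the response shape before writing the full loop. A minimal probe sketch: the endpoint and parameter names come from the full script below; the trimmed Cookie value is an assumption, the complete header set is in the script.

import requests

probe = requests.get(
    "https://tianqi.2345.com/Pc/GetHistory",
    params={
        "areaInfo[areaId]": 71952,   # Yuelu district
        "areaInfo[areaType]": 2,
        "date[year]": 2023,
        "date[month]": 1,
    },
    headers={
        # per the note in the script below, the Cookie and Referer keys are what matter
        "Cookie": "positionCityID=71952; positionCityPinyin=yuelu",  # trimmed; full value in the script
        "Referer": "https://tianqi.2345.com/wea_history/71952.htm",
    },
    timeout=3,
)
payload = probe.json()
print(payload.keys())          # the month's table sits under the "data" key
print(payload["data"][:200])   # peek at the start of the HTML <table> fragment

The full script:
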
from io import StringIO
import requests
import pandas as pd


def craw_table(year, month):
    """
    Fetch the weather table for a given year and month.
    :param year:
    :param month:
    :return: DataFrame for that month
    """
    params = {
        "areaInfo[areaId]": 71952,
        "areaInfo[areaType]": 2,
        "date[year]": year,
        "date[month]": month,
    }

    response = requests.get(URL, headers=headers, params=params, timeout=3)
    data = response.json()["data"]
    # wrap the HTML string in StringIO before handing it to pandas
    df = pd.read_html(StringIO(data))[0]
    return df


URL = "https://tianqi.2345.com/Pc/GetHistory"


# it turned out that only the "Cookie" and "Referer" keys are actually required
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    "Cookie": "positionCityID=71952; positionCityPinyin=yuelu; Hm_lvt_a3f2879f6b3620a363bec646b7a8bcdd=1719578091; lastCountyId=71952; lastCountyPinyin=yuelu; lastProvinceId=23; lastCityId=57687; Hm_lpvt_a3f2879f6b3620a363bec646b7a8bcdd=1719578124; lastCountyTime=1719578124",
    "Referer": "https://tianqi.2345.com/wea_history/71952.htm",
}

df_list = []
for used_year in range(2020, 2024):
    for used_month in range(1, 13):
        print(f"{used_year} {used_month}")
        needed_df = craw_table(used_year, used_month)
        df_list.append(needed_df)
pd.concat(df_list).to_excel("岳麓区历史天气数据.xlsx", index=False)

This produces 岳麓区历史天气数据.xlsx.

Scraping a Novel

"""
爬取小说
"""

import requests
from bs4 import BeautifulSoup



def get_novel_chapters() ->list:
global novel_name
root_url = input()
r = requests.get(root_url,timeout=3)
r.encoding = "gbk"
novel_soup = BeautifulSoup(r.text,"html.parser")
chapters = []
novel_name = novel_soup.find("div",id="info").find('h1').get_text()
for dd in novel_soup.find_all("dd"):
chapters_link = dd.find("a")
if not chapters_link:
continue
chapters.append((f"{root_url}{chapters_link['href'][10:]}", chapters_link.text))
return chapters


def get_chapter_content(url):
r = requests.get(url,timeout=3)
r.encoding = 'gbk'
chapter_soup = BeautifulSoup(r.text, "html.parser")
chapter_content = chapter_soup.find("div", id="content").get_text()
chapter_content = chapter_content.replace('\xa0', ' ')
return chapter_content




novel_chapters = get_novel_chapters()
for chapter in novel_chapters[9:]:
chapter_url, chapter_title = chapter
with open(f"{novel_name}.txt", "w") as fout:
fout.write(chapter_title)
fout.write("\n")
fout.write(get_chapter_content(chapter_url))

While batch-crawling the chapters, my IP address seems to have been blacklisted; the site won't even open anymore...
The code itself should be fine, though.
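
One common way to make batch crawling less likely to trip a ban is to pace the requests and retry politely. A standard-library sketch (fetch_politely is just an illustrative helper name, not part of the original script):

import time
import requests


def fetch_politely(url, headers=None, retries=3, delay=2.0):
    """GET with a pause between attempts, so requests are not fired back-to-back."""
    for attempt in range(retries):
        try:
            r = requests.get(url, headers=headers, timeout=5)
            if r.status_code == 200:
                return r
        except requests.RequestException:
            pass
        time.sleep(delay * (attempt + 1))  # back off a little more on every retry
    raise RuntimeError(f"giving up on {url}")


# usage sketch: pause between chapters instead of hammering the site
# for chapter_url, chapter_title in novel_chapters[9:]:
#     r = fetch_politely(chapter_url)
#     time.sleep(1.0)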