再次尝试爬取自己的博客

import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote

# Fetch the blog home page and print the link target and link text found in
# each "recent-post-info" block.
url = "http://www.wangzhilei.fun"

# A timeout keeps the script from hanging forever if the server never answers.
r = requests.get(url, timeout=10)

if r.status_code != 200:
    # Say which URL failed and with what status instead of raising a bare,
    # message-less Exception.
    raise Exception(f"GET {url} failed with HTTP status {r.status_code}")
# Force UTF-8 decoding of the body before reading r.text.
r.encoding = 'utf-8'
html_doc = r.text
soup = BeautifulSoup(html_doc, "html.parser")
# Select every <div> whose class is "recent-post-info"; the first <a> inside
# each one is treated as the post link.
div_nodes = soup.find_all("div", class_="recent-post-info")

for div_node in div_nodes:
    link = div_node.find("a")
    # Skip blocks without an anchor (find() returned None) or without an href
    # instead of crashing on link["href"].
    if link is None or not link.get("href"):
        continue
    # unquote() turns percent-encoded characters in the href back into
    # readable text before printing.
    print(unquote(link["href"]), link.get_text())
    print("================================")

爬取成功！！！

爬取博客网站所有文章列表

知识点：

附带cookie字典（可通过 requests.get 的 cookies 参数传入；下面的示例代码并未用到）
正则表达式实现模糊匹配（用 re.match 匹配形如 /年/月/日/… 的文章链接）

import urllib
from urllib.parse import unquote
from utlis import url_manager
import requests
from bs4 import BeautifulSoup
import re

# Crawl the blog starting from RootURL: fetch each queued page, record its
# URL and <title> in craw_all_pages.txt, and queue every link whose href
# looks like a dated article path.
RootURL = 'https://wangzhilei.fun'

URLS = url_manager.UrlManager()
URLS.add_new_url(RootURL)

# Article hrefs start with /YYYY/MM/DD/ ; compile the pattern once instead of
# rebuilding it for every link on every page.
ARTICLE_HREF = re.compile(r"/\d{4}/\d{2}/\d{2}/.*$")

# Explicit UTF-8 so non-ASCII titles/URLs can be written regardless of the
# platform's default encoding; the with-block closes the file even if the
# crawl raises part-way through.
with open("craw_all_pages.txt", "w", encoding="utf-8") as fout:
    while URLS.has_new_url():
        CurrUrl = URLS.get_url()
        try:
            r = requests.get(CurrUrl, timeout=10)
        except requests.RequestException as exc:
            # A timeout or connection failure on one page should not abort the
            # whole crawl; treat it like the non-200 case below.
            print(f'error: {CurrUrl} {exc}')
            continue
        if r.status_code != 200:
            print('error')
            continue
        soup = BeautifulSoup(r.text, 'html.parser')
        # Pages without a <title> tag yield None instead of raising
        # AttributeError on soup.title.string.
        title = soup.title.string if soup.title is not None else None

        fout.write(f'{urllib.parse.unquote(CurrUrl)} {title}\n')
        # Flush after every page so partial results survive an interruption.
        fout.flush()
        # NOTE(review): URLS.new_urls is presumably the collection of URLs
        # still waiting to be crawled -- confirm against UrlManager.
        print(f'success:{CurrUrl} {title} {len(URLS.new_urls)}')

        for link in soup.find_all('a'):
            href = link.get('href')
            if href is None:
                continue
            # match() anchors at the start, so only site-relative article
            # paths qualify (absolute http(s):// links do not).
            if ARTICLE_HREF.match(href):
                full_url = urllib.parse.urljoin(RootURL, href)
                # Queue the percent-decoded form of the absolute URL.
                decoded_url = urllib.parse.unquote(full_url)
                URLS.add_new_url(decoded_url)

输出如下：

得到的文档如下：

https://wangzhilei.fun NULL
https://wangzhilei.fun/2024/06/23/Python爬虫（一）/ Python爬虫（一） | NULL
https://wangzhilei.fun/2024/06/22/PAM4与OFDM代码分析/ PAM4与OFDM代码分析 | NULL
https://wangzhilei.fun/2024/05/30/test/ test | NULL
https://wangzhilei.fun/2024/06/25/Python爬虫（二）/ Python爬虫（二） | NULL
https://wangzhilei.fun/2024/06/22/OFDM时频图/ OFDM时频图 | NULL
https://wangzhilei.fun/2024/06/21/latex公式/ latex公式 | NULL

爬取成功！！！