再次尝试爬取自己的博客

import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote

# Fetch the blog's front page and print the link and title of every entry
# found in a "recent-post-info" <div>.
url = "http://www.wangzhilei.fun"

# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url, timeout=10)

if r.status_code != 200:
    # Include the URL and status so a failure is diagnosable from the traceback.
    raise RuntimeError(f"GET {url} returned HTTP {r.status_code}")

# Decode the body as UTF-8 instead of relying on the encoding requests guesses.
r.encoding = "utf-8"
html_doc = r.text
soup = BeautifulSoup(html_doc, "html.parser")
div_nodes = soup.find_all("div", class_="recent-post-info")

for div_node in div_nodes:
    link = div_node.find("a")
    # Skip entries with no anchor or no href rather than crashing on them.
    if link is None or not link.get("href"):
        continue
    # unquote() turns percent-encoded path segments back into readable text.
    print(unquote(link["href"]), link.get_text())
    # NOTE(review): indentation was lost in the source; the separator is
    # assumed to be printed once per post - confirm against the original.
    print("================================")


爬取成功!!!

爬取博客网站所有文章列表

知识点:

  • 附带cookie字典
  • 正则表达式实现模糊匹配

import urllib
from urllib.parse import unquote
from utlis import url_manager
import requests
from bs4 import BeautifulSoup
import re

# Crawl the blog starting from its root page, follow every link whose path
# looks like a dated article, and write one "<url> <title>" line per fetched
# page to craw_all_pages.txt.
RootURL = 'https://wangzhilei.fun'

# Article paths have the form /YYYY/MM/DD/<slug>. Compiled once here instead
# of being rebuilt for every link on every page.
pattern = re.compile(r"/\d{4}/\d{2}/\d{2}/.*$")

# UrlManager is a project helper; presumably it queues pending URLs and
# remembers visited ones so pages are not fetched twice - verify in utlis.
URLS = url_manager.UrlManager()
URLS.add_new_url(RootURL)

# Explicit UTF-8 because titles and decoded URLs contain non-ASCII text;
# the with-block guarantees the file is closed even if the crawl raises.
with open("craw_all_pages.txt", "w", encoding="utf-8") as fout:
    while URLS.has_new_url():
        CurrUrl = URLS.get_url()
        try:
            r = requests.get(CurrUrl, timeout=10)
        except requests.RequestException as exc:
            # One unreachable page must not abort the whole crawl.
            print(f'error: {CurrUrl} {exc}')
            continue
        if r.status_code != 200:
            print('error')
            continue
        soup = BeautifulSoup(r.text, 'html.parser')
        # Pages without a <title> element yield None instead of raising.
        title = soup.title.string if soup.title is not None else None

        fout.write(f'{urllib.parse.unquote(CurrUrl)} {title}\n')
        # Flush per page so partial results survive an interrupted run.
        fout.flush()
        print(f'success:{CurrUrl} {title} {len(URLS.new_urls)}')

        for link in soup.find_all('a'):
            href = link.get('href')
            if href is None:
                continue
            # match() anchors at the start, so only site-relative dated
            # paths qualify; absolute and external links are ignored.
            if pattern.match(href):
                full_url = urllib.parse.urljoin(RootURL, href)
                # Queue the percent-decoded form of the URL.
                decoded_url = urllib.parse.unquote(full_url)
                URLS.add_new_url(decoded_url)

输出如下:

得到的文档如下:

https://wangzhilei.fun NULL
https://wangzhilei.fun/2024/06/23/Python爬虫(一)/ Python爬虫(一) | NULL
https://wangzhilei.fun/2024/06/22/PAM4与OFDM代码分析/ PAM4与OFDM代码分析 | NULL
https://wangzhilei.fun/2024/05/30/test/ test | NULL
https://wangzhilei.fun/2024/06/25/Python爬虫(二)/ Python爬虫(二) | NULL
https://wangzhilei.fun/2024/06/22/OFDM时频图/ OFDM时频图 | NULL
https://wangzhilei.fun/2024/06/21/latex公式/ latex公式 | NULL

爬取成功!!!