Skip to content

Latest commit

 

History

History
802 lines (651 loc) · 27.2 KB

File metadata and controls

802 lines (651 loc) · 27.2 KB
Error in user YAML: (<unknown>): found a tab character that violate indentation while scanning a plain scalar at line 3 column 3
---
- oeasy Python 0547
- 这是 oeasy 系统化 Python 教程,从基础一步步讲,扎实、完整、不跳步。愿意花时间学,就能真正学会。
- 本教程同步发布在: 
	- 个人网站: `https://oeasy.org` 
	- 蓝桥云课: `https://www.lanqiao.cn/courses/3584` 
	- GitHub: `https://github.com/overmind1980/oeasy-python-tutorial` 
	- Gitee: `https://gitee.com/overmind1980/oeasypython` 
---

爬取图片数据

回忆

  • 上次爬了汉字源
  • 图片不只是可以单独存在
  • 也可以嵌入到网页中的
  • 应该注意到请求头里面可以放东西
  • 爬取的时候可以利用原来网页整体的结构

观察豆瓣电影250

firefox http://movie.douban.com/top250

图片描述

分页爬

# 导包
import requests
from lxml import etree      # 用lxml解析器生成的对象中的xpath方法
from time import sleep
import csv
import numpy as np


# 指定URL
# url = 'https://movie.douban.com/top250'
# 进行UA伪装
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.41'}

# 定义空列表存放电影数据
tiltes_cn = []       # 中文标题
titles_en = []       # 英文标题
links = []           # 详情页链接
director = []        # 导演
actors = []          # 演员
years = []           # 上映年份
nations = []         # 国家和地区
types = []           # 类型
scores = []           # 评分
rating_nums = []      # 评分人数

fp = open('douban_top250.csv','w',encoding='utf-8')
writer = csv.writer(fp)
writer.writerow(['电影中文名', '电影英文名','电影详情页链接','导演','演员','上映年份','国家和地区','类型','评分','评分人数'])

for i in range(0,226,25):
    url = f'https://movie.douban.com/top250?start={i}&filter='
    # 将URL中的参数封装到字典中
    # data = {
    #     'start':i,            # 设置start参数
    #     'filter':'',
    # }
    # 发起请求,获取网页响应
    response = requests.get(url,headers=headers
                            # ,data=data
                           )
    sleep(1)
    # print(response.status_code)
    # print(response.encoding)
    # print(response.text)
    
    # 获取响应内容
    html = response.text  
    # 实例化一个etree对象
    data = etree.HTML(html)
    
    # 所有电影信息都在li标签下,所以我们可以先定位到li标签,在通过循环获取每一个li标签中的信息
    li_list = data.xpath('//*[@id="content"]/div/div[1]/ol/li')
    # 通过循环遍历每一页中的所有li标签,获取该页面所有电影的数据
    for each in li_list:
        # 中文标题
        title1 = each.xpath('./div/div[2]/div[1]/a/span[1]/text()')[0] 
        tiltes_cn.append(title1)
        
        # 英文标题
        # 每次获取到的数据存在一个列表中,通过下标索引取列表的值
        # 通过字符串的strip()方法去除字符串首尾的指定字符串
        title2 = each.xpath('./div/div[2]/div[1]/a/span[2]/text()')[0].strip('\xa0/\xa0')
        titles_en.append(title2)
        
        # 链接
        link = each.xpath('./div/div[2]/div[1]/a/@href')[0]     
        links.append(link)
        
        # 导演、主演
        info1 = each.xpath('./div/div[2]/div[2]/p[1]/text()[1]')[0].strip()       # 通过strip方法去除字符串的前后空格
        split_info1 = info1.split('\xa0\xa0\xa0')      # 通过指定字符串分割字符串
        dirt = split_info1[0].strip('导演: ')
        director.append(dirt)
        # 有些电影的主演为空,所以需要进行条件判断
        # 如果导演和主演信息都有,则获取主演信息
        if len(split_info1) == 2:
            ac = split_info1[1].strip('主演: ')
            actors.append(ac)
        # 如果没有主演信息,则将其信息设置为空
        else:
            actors.append(np.nan)
        
        # 年份、国籍、类型
        info2 = each.xpath('./div/div[2]/div[2]/p[1]/text()[2]')[0].strip()    # 去除字符串首尾的空格
        split_info2 = info2.split('\xa0/\xa0')    # 通过字符串分割获取字符串中的年份、国籍和类型
        # print(split_info)
        year = split_info2[0]
        nation = split_info2[1]
        ftype = split_info2[2]
        years.append(year)
        nations.append(nation)
        types.append(ftype)
        
        # 电影评分
        score = each.xpath('./div/div[2]/div[2]/div/span[2]/text()')[0]
        scores.append(score)
        
        # 获取电影打分人数
        num = each.xpath('./div/div[2]/div[2]/div/span[4]/text()')[0].strip('人评价')
        rating_nums.append(num)
        
        writer.writerow([title1,title2,link,dirt,ac,year,nation,ftype,score,num])
    print(f'————————————第{int((i/25)+1)}页爬取完毕!——————————————')
        
        
    
fp.close()   # 写入完成后,关闭文件
print('——————————————————————————————————爬虫结束!!!!!————————————————————————————————————————————————')

观察结果

sudo apt install csvtool
  • 安装后观察国别
csvtool col 1,7 douban_top250.csv
  • 效果

图片描述

  • 评分谁最高?谁最低?

评分

import pandas as pd

df = pd.read_csv('douban_top250.csv', sep=",")

score = df['评分'].max()
print("最高评分的电影是:", df[df['评分'] == score]['电影中文名'].iloc[0], "\t分数是:", score)
score = df['评分'].min()
print("最低评分的电影是:", df[df['评分'] == score]['电影中文名'].iloc[0], "\t分数是:", score)
  • 评分高低

图片描述

  • 评分人数
    • 谁高谁低?

评分人数

import pandas as pd

df = pd.read_csv('douban_top250.csv', sep=",")

num = df['评分人数'].max()
print("最多人评分的电影是:", df[df['评分人数'] == num]['电影中文名'].iloc[0], "\t人数是:", num)
num = df['评分人数'].min()
print("最少人评分的电影是:", df[df['评分人数'] == num]['电影中文名'].iloc[0], "\t人数是:", num)
  • 评分人数

图片描述

  • 各种类型
    • 谁多谁少?

安装类库

  • 安装词云和分词包
pip3 install wordcloud
pip3 install jieba

类型

import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba
import csv

def segment(text):
    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~,。?!:;”“’‘-=《》()、【】↓LO':
        text = text.replace(ch, " ")
    words_list_jieba = jieba.lcut(text)
    words_list = []
    for i in words_list_jieba:
        words_list.append(i)
    return words_list

def words_frequency(words_list):
    words_dict = {}
    for key in words_list:
        words_dict[key] = words_dict.get(key, 0) + 1
    return words_dict

def words_dict_sort(words_dict, descending):
    temp_dict = sorted(words_dict.items(), key=lambda x: x[1], reverse=descending)
    return temp_dict

df = pd.read_csv('douban_top250.csv', sep=",")
text = ' '.join(df['类型'].dropna())
words_list = segment(text)
words_dict = words_frequency(words_list)
print(words_dict)
  1. 将标点替换为空格
  2. 将字符串分词生成词列表
  3. 将词列表生成词频字典
  4. 将词频字典输出

图片描述

结果

  • 结果可以生成
    • 柱饼折
    • 词云图

图片描述

  • 想要得到
    • 关于国家和地区的
    • 词频字典

数据清洗

  • 如果想要将一些词汇
    • 从结果中清洗呢?
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba
import csv

def segment(text):
    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~,。?!:;”“’‘-=《》()\n【】↓LO':
        text = text.replace(ch, " ")
    words_list_jieba = jieba.lcut(text)
    words_list = []
    for i in words_list_jieba:
        if i in ["你", "UNG", "她", "三娘", "这位", "微博", "视角", "0", "秀英", "很", "热词", "两会", "看","这些", "聊", "我们", "关于", "要", "视频", "这", "那", "有", "不", "在", "却", "都","就", "是", "就是", "我", "他", "它", "他们","它们", "的", "着", "了", "和", "与", "个", "一个", "一","《", "》", " ", "\n"] or i.isnumeric():
            pass
        else:
            words_list.append(i)
    return words_list

def words_frequency(words_list):
    words_dict = {}
    for key in words_list:
        words_dict[key] = words_dict.get(key, 0) + 1
    return words_dict

def words_dict_sort(words_dict, descending):
    temp_dict = sorted(words_dict.items(), key=lambda x: x[1], reverse=descending)
    return temp_dict

df = pd.read_csv('douban_top250.csv', sep=",")

text = ' '.join(df['国家和地区'].dropna())
words_list = segment(text)
words_dict = words_frequency(words_list)
wordcloud = WordCloud(font_path="/usr/share/fonts/truetype/wqy-zenhei.ttc",  # 为显示正确的中文,需指定字体
                      background_color="white",  # 背景颜色
                      collocations=False,
                      width=800,
                      height=600,
                      margin=2,  # 设置图片默认的大小
                      ).fit_words(words_dict)
wordcloud.to_file('nation.png')

  • 可以把某些单词
    • 排除

最终结果

  • 首先声明
    • 香港地区、澳门地区、台湾地区是
    • 中国不可分割的一部分

图片描述

  • 如果想要将
    • 中国大陆、中国香港、中国台湾
    • 作为地区 分类统计

添加词汇

  • 将三个单词加入字典词条
jieba.add_word("中国大陆", freq=None, tag=None)
jieba.add_word("中国台湾", freq=None, tag=None)
jieba.add_word("中国香港", freq=None, tag=None)
  • 结果

图片描述

  • 期待第八、九艺术起飞

数据分析

import bs4
import requests

import json

import re
import matplotlib.pyplot as plt
from matplotlib import rcParams
from collections import Counter

rcParams['font.sans-serif'] = ['SimHei']  # 使用黑体字体,确保你的系统中有这个字体
rcParams['axes.unicode_minus'] = False  # 解决保存图像是负号'-'显示为方块的问题

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.3.5211 SLBChan/105'}

movies = []

for page in range(1, 11):
    offset = (page - 1) * 25  # 计算偏移量
    url = f'https://movie.douban.com/top250?start={offset}&filter='

    resp = requests.get(url, headers=headers)

    if resp.status_code != 200:
        print(f"Failed to fetch page {page}. Status code: {resp.status_code}")
        continue

        # 创建BeautifulSoup对象
    soup = bs4.BeautifulSoup(resp.text, 'lxml')

    movie_items = soup.select('div.item')  # 选择每个电影条目的容器

    for item in movie_items:
        title_a = item.select_one('div.info > div.hd > a')  # 选择标题
        rank_span = item.select_one('div.info > div.bd > div.star > span.rating_num')  # 选择评分
        inq = item.select_one('div.bd > p:nth-of-type(1)')  # 选择第一个简短介绍,可能包含导演和主演

        if title_a and rank_span:
            movie_info = {
                'title': title_a.text,
                'rating': rank_span.text if rank_span else 'N/A',
                'leader_info': inq.text.strip() if inq else 'N/A'
            }
            print(movie_info)
            movies.append(movie_info)
        # 将movies列表保存为JSON文件
        with open('豆瓣电影TOP250.json', 'w', encoding='utf-8') as f:
            json.dump(movies, f, ensure_ascii=False, indent=4)  # indent=4用于美观的格式化输出

        print("Movies have been saved to movies.json")


# 读取JSON文件
with open('豆瓣电影TOP250.json', 'r', encoding='utf-8') as f:
    movies = json.load(f)

# 绘制评分直方图
ratings = [movie['rating'] for movie in movies]
plt.figure(figsize=(10, 6))
plt.hist(ratings, bins=len(set(ratings)), align='mid', rwidth=0.8, color='skyblue', edgecolor='black')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.title('Distribution of Movie Ratings')
plt.xticks(rotation=45)  # 如果标签太长,可以旋转它们
plt.grid(axis='y', alpha=0.75)
plt.show()

# 绘制一个条形图来显示电影名称和评分
names = [movie['title'].strip().split('\n')[0] for movie in movies]
ratings = [float(movie['rating']) for movie in movies]

# 创建条形图
plt.figure(figsize=(500, 6))
plt.bar(names, ratings)
plt.xlabel('Movie')
plt.ylabel('Rating')
plt.title('Movie Ratings')
plt.xticks(rotation=90)  # 旋转x轴标签以避免重叠
plt.show()

# 提取年份列表
years = []
for movie in movies:
    # 使用正则表达式匹配年份
    match = re.search(r'\n\s*(\d{4})\s*[/—-]*', movie['leader_info'])
    if match:
        # 如果匹配成功,提取年份并添加到列表中
        year = int(match.group(1))
        years.append(year)

    # 绘制年份直方图
plt.figure(figsize=(10, 6))
plt.hist(years, bins=range(min(years) - 2, max(years) + 3, 1), align='mid', rwidth=0.8, color='skyblue',
         edgecolor='black')
plt.xlabel('Year')
plt.ylabel('Frequency')
plt.title('Distribution of Movie Release Years')
plt.xticks(rotation=45)  # 如果标签太长,可以旋转它们
plt.grid(axis='y', alpha=0.75)
plt.show()

# 提取所有国家并统计次数
countries = []
for movie in movies:
    # 匹配年份后的所有非空白、非分隔符的国家名称
    match = re.search(r'\d{4}\s*[/—-]*\s*([^\s/—-]+(?:\s+[^\s/—-]+)*)', movie['leader_info'])
    if match:
        # 分割匹配到的字符串以获取每个国家
        countries.extend(match.group(1).split())

    # 使用Counter统计次数
country_counts = Counter(countries)

# 绘制直方图
plt.figure(figsize=(10, 6))
plt.bar(country_counts.keys(), country_counts.values(), align='center')
plt.xlabel('Country')
plt.ylabel('Frequency')
plt.title('Country Distribution')
plt.xticks(rotation=45)  # 如果标签太长,可以旋转它们
plt.tight_layout()  # 自动调整子图参数,使之填充整个图像区域
plt.show()

# 提取年份和评分
years_ratings = []
for movie in movies:
    # 使用正则表达式匹配年份
    match_year = re.search(r'\n\s*(\d{4})\s*[/—-]*', movie['leader_info'])
    if match_year:
        year = int(match_year.group(1))
        rating = float(movie['rating'])
        years_ratings.append((year, rating))

    # 绘制年份与评分的散点图
plt.figure(figsize=(10, 6))
years, ratings = zip(*years_ratings)  # 解压年份和评分
plt.scatter(years, ratings, color='skyblue', edgecolor='black')
plt.xlabel('Year')
plt.ylabel('Rating')
plt.title('Year vs Rating Scatter Plot')
plt.xticks(rotation=45)  # 如果标签太长,可以旋转它们
plt.grid(axis='y', alpha=0.75)
plt.show()

效果

  • 评分和年代的关系

图片描述

图片描述

图片描述

豆瓣评论

import requests
from lxml import etree
#import pandas as pd
#import time

def page(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/116.0',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
               'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
               # 'Accept-Encoding': 'gzip, deflate, br',
               'Connection': 'keep-alive',
               'Referer': 'https://movie.douban.com/',
               'Upgrade-Insecure-Requests': '1',
               'Sec-Fetch-Dest': 'iframe',
               'Sec-Fetch-Mode': 'navigate',
               'Sec-Fetch-Site': 'cross-site',
               'Cookie': 'll="118381"; bid=u8W938yW3Rw; __utma=30149280.1427340690.1714406596.1714476685.1714485673.6; __utmz=30149280.1714485673.6.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; dbcl2="230302338:Y8MCu494BKI"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.23030; __gads=ID=20dba742ff0011d4:T=1714406703:RT=1714485675:S=ALNI_MZz8jZp982rT48QdQdDBjh5a_vi0w; __gpi=UID=00000dff8c542177:T=1714406703:RT=1714485675:S=ALNI_MZxicDjcSAMi_pdkeduQ3otQCutaQ; __eoi=ID=4874846fe6a3305c:T=1714406703:RT=1714485675:S=AA-Afjb2qzoXLCtivOMEIgch2k1e; __utmc=30149280; ck=mJ0K; frodotk_db="f723232f0ba97d93e569698d22fc78b9"; __utmb=30149280.2.10.1714485673; ap_v=0,6.0; __utmt=1',
               }
    response= requests.get(url=url ,headers=headers)
    c_html = response.content
    return c_html

def next_p(c_html,base_url):
    link = None
    html_n = etree.HTML(c_html)
    lu_next = html_n.xpath('//div[@id="paginator"]/a[@class="next"]/@href')
    if lu_next:
        link = base_url + lu_next[0]
   # time.sleep(5)
    return link

def pin(c_html):
    #print(c_html)
    et_html= etree.HTML(c_html)
    lujing = et_html.xpath("/html/body/div[3]/div[1]/div/div[1]/div[4]/div/div[2]/p/span")
    return lujing
    '''for i in lujing:
        return i.text'''

def tol():
    base_url = 'https://movie.douban.com/subject/4321270/comments'
    link = base_url
    while link:
        c_html = page(link)
        data=[]
        for j in pin(c_html):
            data.append(j.text)
        print(data)
        link=next_p(c_html,base_url)
    return data

if __name__ == '__main__':
    #df = pd.DataFrame(tol())
    #df.to_csv('output.csv',index = False, encoding='utf-8')
    tol()

豆瓣读书标签

from bs4 import BeautifulSoup
import requests
import re
#import threading
#import want2url
import pandas as pd
from tqdm import tqdm

url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T"

class douban_crawler():
    send_headers = {
        "Host": "book.douban.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Connection": "close"
    }

    def __init__(self, url, pages):
        """
        :param url: 爬虫的最初界面,决定了要爬的书籍的类别信息
        :param pages: 要爬取的页数,豆瓣每页20本书的信息,决定了要爬取的数据量
        """
        self.url = url
        self.pages = [20*i for i in range(pages)]
        self.book_class = ""
        self.book_names = []
        self.book_nations = []
        self.book_writers = []
        self.book_scores = []
        self.book_comments = []
        self.book_sites = []
        self.book_pages = []

    def generate_urls(self):
        idx_urls = []
        #正则表达式
        page_key = re.compile(r"(\S*\?)")
        #利用正则表达式匹配出url的必须部分,后面和控制页数的变量进行拼接成索要检索的所有url列表
        #注意利用正则匹配到的返回结果为一个列表,一般需要取出列表中的值进行下面的操作
        page_main = page_key.findall(self.url)[0]
        #“合成”所有url列表,因为豆瓣的规则是每20本书放在一页中,并且用url中的start关键字控制显示的页数
        for i in self.pages:
            g_url = page_main+"start="+str(i)+"&type=T"
            idx_urls.append(g_url)
        return idx_urls

    def open_url(self, url=None):
        #如果不给需要打开的url则自动打开最初始界面(对象初始化给的界面)
        if url == None:
            #对网站发起get请求
            resp = requests.get(self.url,headers=self.send_headers)
            #获取返回信息的文本部分
            resp_text = resp.text
            #利用BS库对文本部分进行html解析,并返回解析后的界面
            soup = BeautifulSoup(resp_text, "html.parser")
            return soup
        else:
            resp = requests.get(url, headers=self.send_headers)
            resp_text = resp.text
            soup = BeautifulSoup(resp_text, "html.parser")
            return soup

    def get_main_info(self, url):
        """
        获取url列表页面能获取主要信息,不打开各个书的独立页面,
        主要信息包括:书的所属类别,作者国家,书名,每本书的索引url,书的作者,书的评分,书的简介,书的页数
        :return: 各个主要信息的存储列表

        """
        #分别为,书类别,国家,作者和简介的正则表示式
        book_class_key = re.compile(": (\D*)")
        book_nation_key = re.compile("\[(\D*?)\]")
        book_writer_key1 = re.compile("^(\D*?)/")
        book_writer_key2 = re.compile("](\D*)$")
        book_comment_key = re.compile(r"<p>(\S*)</p>")
        #创建存储主要信息的列表:因为书名是固定的,一个大页面是一个类别,所以只需要返回一次,不需要列表存储
        book_names = []
        book_pages = []
        book_nations = []
        book_writers = []
        book_comments = []
        book_scores = []
        #对url列表进行遍历并操作
        #urls = self.generate_urls()
        #为了防止耦合,最好一个函数只操作一个页面,在主函数进行对这个函数的遍历操作
        resp = requests.get(url, headers=self.send_headers) #和上面一样的操作,向url发送get请求

        resp_text = resp.text #获取返回的文本信息

        soup = BeautifulSoup(resp_text, "html.parser") #利用BS库对html格式的文本信息进行解析
        # 获取图书类别
        book_class = soup.find("h1").get_text(strip=True)
        book_class = book_class_key.findall(book_class)
        # 获取书名
        for a in soup.find_all("a"):
            try:
                # 获取书名
                res = a.get("title")
                # 获取对应的内层网站
                res_url = a.get("href")
                # 获取每本书对应的独立页面url
                if res != None:
                    book_names.append(res)
                    book_pages.append(res_url)
            except:
                pass

        """
        获取书的作者和作者国籍,因为非中国籍的形式为[国家]作者,而中国籍作者在作者名前没有[]
        所以我们用两个正则表达式分别检索,但是少数作者即使不为中国籍,也没有加[],此类我把这类数据当作脏数据
        为了尽可能的修正这种数据带来的影响,设置判定条件为,没有[]且作者名小于五个字,为中国作者
        """
        for nation in soup.find_all("div", attrs={"class": "pub"}):
            nn = nation.get_text().strip()
            # print(nn)
            book_writer = book_writer_key1.findall(nn)[0]

            if ']' in book_writer:
                book_writers.append(book_writer_key2.findall(book_writer)[0].strip())
            else:
                book_writers.append(book_writer)

            try:
                bn = book_nation_key.findall(nn)
                if bn == [] and len(book_writer) < 5:  #中国籍作者的判定条件
                    book_nations.append("中")
                elif bn != []:
                    # print(bn)
                    book_nations.append(bn[0])
                else:
                    book_nations.append("日")
            except:
                book_nations.append("中")

        #获取书籍简介
        for comment in soup.find_all("div", attrs={"class": "info"}):
            if comment.find_all("p") == []:
                book_comments.append("无简介")
            else:
                book_comments.append(comment.find_all("p")[0].get_text())

        #获取书籍评分
        for score in soup.find_all("span", attrs={"class": "rating_nums"}):
            book_scores.append(score.get_text())

        return book_names, book_pages, book_class*20, book_writers, book_nations, book_comments, book_scores

    def get_page_numbers(self, urls):
        """
        从每个图书的独立页面中获取数据,目前只获取了页数数据
        :param urls: 从get_main_info中生成的图书独立页面url列表
        :return: 对应图书的页数数据
        """
        book_pagesnumber = []
        print("****开始获取页数信息****")
        for url in tqdm(urls):
            rrr = requests.get(url, headers=self.send_headers)
            rtext = rrr.text
            in_soup = BeautifulSoup(rtext, 'html.parser')
            # print(in_soup.text)
            page_num = re.compile(r"页数: (\d*)").findall(in_soup.text)
            #有可能有的书缺失页数信息,遇上此类情况全部将页数设置为0
            if page_num == []:
                book_pagesnumber.append(0)
            else:
                book_pagesnumber.extend(page_num)

        return book_pagesnumber

    def begin_crawl(self):
        """
        类的“主函数”只需要执行这个函数就可以完成爬虫功能
        :return: 所有的信息列表
        """
        sum_book_names = []
        sum_book_urls = []
        sum_book_class = []
        sum_book_writers = []
        sum_book_nations = []
        sum_book_comments = []
        sum_book_scores = []
        sum_book_pages = []
        urls = self.generate_urls() #生成要爬取的所有页面的url地址
        print("****开始爬取****")
        for url in tqdm(urls):
            book_names, book_urls, book_class, book_writers, book_nations, book_comments, book_scores = self.get_main_info(url)
            book_pages = self.get_page_numbers(book_urls)

            sum_book_names.extend(book_names)
            sum_book_urls.extend(book_urls)
            sum_book_class.extend(book_class)
            sum_book_writers.extend(book_writers)
            sum_book_nations.extend(book_nations)
            sum_book_comments.extend(book_comments)
            sum_book_scores.extend(book_scores)
            sum_book_pages.extend(book_pages)

        return sum_book_names, sum_book_urls, sum_book_class, sum_book_writers, sum_book_nations, sum_book_comments, sum_book_scores, sum_book_pages

    def write2csv(self):
        """
        将爬取结果写入csv文件中
        :return: 无返回值
        """
        name, url, book_class, writer, nation, comment, score, pages = self.begin_crawl()
        info_df = pd.DataFrame(columns=["name", "url", "class", "writer", "nation", "comment", "score", "pages"])
        info_df["name"] = name
        info_df["url"] = url
        info_df["class"] = book_class
        info_df["writer"] = writer
        info_df["nation"] = nation
        info_df["comment"] = comment
        info_df["score"] = score
        info_df["pages"] = pages

        info_df.to_csv(f"{book_class[0]}.csv", header=None, encoding="utf_8_sig")


if __name__ == '__main__':
    db_crawler = douban_crawler(url, 5)
    db_crawler.write2csv()




总结

  • 豆瓣爬完了
  • 还能爬点什么呢?🤔
  • 下次再说

  • 本文来自 oeasy Python 系统教程。
  • 想完整、扎实学 Python,
  • 搜索 oeasy 即可。