Skip to content

Latest commit

 

History

History
295 lines (253 loc) · 11.4 KB

File metadata and controls

295 lines (253 loc) · 11.4 KB
Error in user YAML: (<unknown>): found a tab character that violate indentation while scanning a plain scalar at line 3 column 3
---
- oeasy Python 0549
- 这是 oeasy 系统化 Python 教程,从基础一步步讲,扎实、完整、不跳步。愿意花时间学,就能真正学会。
- 本教程同步发布在: 
	- 个人网站: `https://oeasy.org` 
	- 蓝桥云课: `https://www.lanqiao.cn/courses/3584` 
	- GitHub: `https://github.com/overmind1980/oeasy-python-tutorial` 
	- Gitee: `https://gitee.com/overmind1980/oeasypython` 
---

新浪微博

微博

查看数据

图片描述

实现接口

  • 从爬取到
    • 情感分析
    • 词云图
    • 散点图
import json
import re
import openpyxl
import requests
from lxml import etree
import csv

cookies = {
    'SUB': '_2AkMRcyacf8NxqwFRmfsQxG3lb4t3wwrEieKnL9dHJRMxHRl-yT9vqhA7tRB6OvMIc71hiVugltoaHcZ3tShbEbigs6j1',
    'SUBP': '0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWlN0z94LR6Vq-2CZGGpQC7',
    'XSRF-TOKEN': 'ALBZ5zY_YLCch7bfNKBZmnqA',
    'WBPSESS': 'gJ7ElPMf_3q2cdj5JUfmvOmOpSk7C1JxpzOkDaH8sK1Kcrj3y9bjLcsXbcXwDAKNQ5BVj_MmHDUbEie0S_8hpMBP57KXkPC2z7FS_xrFiJH7jJhopjBRFKQtskOuMVg8',
    '_s_tentry': 'weibo.com',
    'Apache': '9474795353437.963.1714399695446',
    'SINAGLOBAL': '9474795353437.963.1714399695446',
    'ULV': '1714399695470:1:1:1:9474795353437.963.1714399695446:',
}

headers = {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'client-version': 'v2.45.7',
    # 'cookie': 'SUB=_2AkMRcyacf8NxqwFRmfsQxG3lb4t3wwrEieKnL9dHJRMxHRl-yT9vqhA7tRB6OvMIc71hiVugltoaHcZ3tShbEbigs6j1; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWlN0z94LR6Vq-2CZGGpQC7; XSRF-TOKEN=ALBZ5zY_YLCch7bfNKBZmnqA; WBPSESS=gJ7ElPMf_3q2cdj5JUfmvOmOpSk7C1JxpzOkDaH8sK1Kcrj3y9bjLcsXbcXwDAKNQ5BVj_MmHDUbEie0S_8hpMBP57KXkPC2z7FS_xrFiJH7jJhopjBRFKQtskOuMVg8; _s_tentry=weibo.com; Apache=9474795353437.963.1714399695446; SINAGLOBAL=9474795353437.963.1714399695446; ULV=1714399695470:1:1:1:9474795353437.963.1714399695446:',
    'priority': 'u=1, i',
    'referer': 'https://weibo.com/newlogin?tabtype=weibo&gid=102803&openLoginLayer=0&url=https%3A%2F%2Fwww.weibo.com%2F',
    'sec-ch-ua': '"Chromium";v="124", "Microsoft Edge";v="124", "Not-A.Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'server-version': 'v2024.04.29.1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
    'x-requested-with': 'XMLHttpRequest',
    'x-xsrf-token': 'ALBZ5zY_YLCch7bfNKBZmnqA',
}

url = requests.get('https://weibo.com/ajax/side/hotSearch', cookies=cookies, headers=headers)
# print(url.text)


ws=[]
ws.append(['顺序','热搜分类','热搜关键词'])
data = json.loads(url.text)['data']['realtime']
for i in data:
    try:
        print(f'热搜:{i["realpos"]}, 热搜分类[{i["category"]}], 热搜关键词:{i["word"]}')
        ws.append([i["realpos"],i["category"],i["word"].replace(' ','')])
    except:
        pass

with open("热搜.csv", 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    for i in ws:
        writer.writerow(i)

话题数据采集

# coding=gbk
# -*- coding:uft-8 -*-
# 帖文采集

# 1.导入爬虫相关库
import requests
from lxml import etree
import time
from urllib import parse
import pandas as pd
from fake_useragent import UserAgent


def collect(k):
    resLs = []  # 存储采集结果的列表
    for page in range(50):  # 爬取50页贴文
        time.sleep(3)  # 每次请求后等待3秒
        page += 1  # 页数自增1
        url = f'https://s.weibo.com/weibo?q={parse.quote(k)}&xsort=hot&suball=1&Refer=g&page={page}'  # # 构造请求URL
        print(k, page, '>>>')
        headers = {
            'Cookie': ck,  # 设置请求的Cookie
            'User-Agent': ua,  # 设置请求的User-Agent
            'Referer': url  # 设置Referer头部信息
        }
        while True:
            try:
                res = requests.get(url=url, headers=headers, timeout=(5, 5)).text  # 发送请求并获取响应
                break
            except:
                time.sleep(5)  # 请求超时,等待5秒后重试
        if f'<p>抱歉,未找到“{k}”相关结果。</p>' in res:
            break  # 如果未找到相关结果,则结束采集
        tree = etree.HTML(res)  # 解析HTML
        for li in tree.xpath('//div[@action-type="feed_list_item"]'):  # 遍历每个帖子
            name = li.xpath('.//a[@class="name"]/text()')[0]  # 获取昵称
            uid = li.xpath('.//a[@class="name"]/@href')[0].split('/')[-1].split('?')[0]  # 获取用户ID
            date = li.xpath('.//div[@class="from"]/a/text()')[0].strip()  # 获取时间
            cbox = li.xpath('.//p[@node-type="feed_list_content_full"]')
            cbox = li.xpath('.//p[@node-type="feed_list_content"]')[0] if not cbox else cbox[0]  # 获取内容
            cont = ''.join(cbox.xpath('.//text()')).replace('收起d', '').strip()
            tran = li.xpath('.//div[@class="card-act"]/ul/li[1]/a//text()')[1].strip()  # 获取转发数
            try:
                tran = eval(tran)
            except:
                tran = 0
            comm = li.xpath('.//div[@class="card-act"]/ul/li[2]/a//text()')[0].strip()  # 获取评论数
            try:
                comm = eval(comm)
            except:
                comm = 0
            like = li.xpath('.//span[@class="woo-like-count"]/text()')[0].strip()  # 获取点赞数
            try:
                like = eval(like)
            except:
                like = 0
            ID = li.xpath('./@mid')[0]  # 获取帖子ID
            dic = {
                '昵称': name,
                '用户': uid,
                '时间': date,
                '内容': cont,
                '转发': tran,
                '评论': comm,
                '点赞': like,
                'ID': ID
            }
            resLs.append(dic)  # 将采集到的帖子信息添加到结果列表
            print(dic)  # 打印帖子信息
        df = pd.DataFrame(resLs)  # 将结果列表转换为DataFrame
        df.to_excel('北大宿舍聊天(50页1).xlsx', index=False)  # 将结果保存为Excel文件


if __name__ == '__main__':
    # 2.填入正确却有效的cookies
    ck = 'your cookie'
    # 3.设置随机User-Agent
    ua = UserAgent().chrome
    # 4.调用并贴名 启动爬虫
    collect('北大宿舍聊天')

图片描述

爬取主题

import time
import random
import requests
from lxml import etree
import csv
import re
import time
from urllib import parse
import pandas as pd

f = open('花西子9.10.csv', 'a', encoding='utf-8_sig', newline='')  # 用于保存数据
csv_write = csv.writer(f)
csv_write.writerow(["关键词", "用户ID", "用户名称", "转发数量", "获赞数量", "评论数量", "时间", "文章"])

n = 1
keys = '花西子眉笔'

# 在写入之前存储已经写入的用户ID,避免重复写入
written_ids = set()



for i in range(1, 50):

    cookies = {
        'SINAGLOBAL': '1848242080404.5386.1711895292295',
        'SCF': 'AiOLF-4l3JRNN3AGu3BOLRvUqWMIUa7rNzjTB4X3jpMSNZoZfPNhrNYd9mQVDRNb9oeakANLSKph_5Vye9eDa5A.',
        'ALF': '1723209315',
        'SUB': '_2A25LivkyDeRhGeFG41QY8C_Pyj-IHXVo5nT6rDV8PUJbkNANLXPWkW1NeIvLJoJHGJxDa4UWavydjjE-FLxGBBAu',
        'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9WF1YLf8.eNv5ZguCsp4i0Tc5JpX5KMhUgL.FoMR1hq4eh20eKe2dJLoIERLxKBLBonL1h5LxK-L12qLB-2LxK-LBo5L12qLxKML1hnLB-eLxKnL1K.LB-i0',
        '_s_tentry': '-',
        'Apache': '773223686730.8506.1720618573190',
        'ULV': '1720618573196:6:1:1:773223686730.8506.1720618573190:1717996997784',
    }

    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        # Requests sorts cookies= alphabetically
        # 'cookie': 'SINAGLOBAL=1848242080404.5386.1711895292295; SCF=AiOLF-4l3JRNN3AGu3BOLRvUqWMIUa7rNzjTB4X3jpMSNZoZfPNhrNYd9mQVDRNb9oeakANLSKph_5Vye9eDa5A.; ALF=1723209315; SUB=_2A25LivkyDeRhGeFG41QY8C_Pyj-IHXVo5nT6rDV8PUJbkNANLXPWkW1NeIvLJoJHGJxDa4UWavydjjE-FLxGBBAu; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WF1YLf8.eNv5ZguCsp4i0Tc5JpX5KMhUgL.FoMR1hq4eh20eKe2dJLoIERLxKBLBonL1h5LxK-L12qLB-2LxK-LBo5L12qLxKML1hnLB-eLxKnL1K.LB-i0; _s_tentry=-; Apache=773223686730.8506.1720618573190; ULV=1720618573196:6:1:1:773223686730.8506.1720618573190:1717996997784',
        'priority': 'u=0, i',
        'referer': 'https://s.weibo.com/weibo?q=%E8%8A%B1%E8%A5%BF%E5%AD%90',
        'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0',
    }

    params = {
        'q': keys,
        'page': i,
    }

    response = requests.get('https://s.weibo.com/weibo', params=params, cookies=cookies, headers=headers)

    # time.sleep(random.uniform(4.4, 9.6))
    data = etree.HTML(response.content.decode())
    div_list = data.xpath('//div[@id="pl_feedlist_index"]/div[2]/div')
    for div in div_list:
        try:
            href = div.xpath('.//div[@class="info"]/div[2]/a/@href')[0]
        except:
            print("111111111111")
            continue
        id = href.split('?')[0].replace('//weibo.com/', '')
        name = div.xpath('.//div[@class="info"]/div[2]/a/text()')[0]

        # 检查用户ID是否已经写入,如果已经写入,则跳过
        if id in written_ids:
            continue

        forward = div.xpath('.//div[@class="card-act"]/ul/li[1]/a//text()')[1].replace(' ', '')
        if forward == '转发':
            forward = 0
        else:
            forward = forward
        like = div.xpath('.//div[@class="card-act"]/ul/li[3]/a//span[@class="woo-like-count"]/text()')[0]
        if like == '赞':
            like = 0
        else:
            like = like
        comment = div.xpath('.//div[@class="card-act"]/ul/li[2]/a//text()')[0].replace(' ', '')
        if comment == '评论':
            comment = 0
        else:
            comment = comment
        date = div.xpath('.//div[@class="from"]/a[1]/text()')
        date = [x.strip() for x in date if x.strip() != '']
        date = date[0]
        text = div.xpath('.//p/text()')
        text = [x.strip() for x in text if x.strip() != '']
        text = ''.join(text).replace(' \u200b', '')
        csv_write.writerow([keys, id, name, forward, like, comment, date, text])

        # 将已经写入的用户ID加入集合
        written_ids.add(id)

        print({"关键词": keys, "用户ID": id, "用户名称": name, "转发数量": forward, "获赞数量": like, "评论数量": comment, "时间": date, "文章内容": text})
        n = n + 1
    print(f'爬取了{i}页' + str(n) + "条")

# 关闭文件
f.close()

总结

  • 了解元素的标签成员
    • tag
  • etree.Element最重要的是
    • 构成一棵家族树
  • 下次再说

  • 本文来自 oeasy Python 系统教程。
  • 想完整、扎实学 Python,
  • 搜索 oeasy 即可。