---
- oeasy Python 0549
- This is the oeasy systematic Python tutorial: it starts from the basics and moves forward step by step, solid, complete, and without skipping anything. Put in the time and you can truly learn it.
- This tutorial is also published at:
- 个人网站: `https://oeasy.org`
- 蓝桥云课: `https://www.lanqiao.cn/courses/3584`
- GitHub: `https://github.com/overmind1980/oeasy-python-tutorial`
- Gitee: `https://gitee.com/overmind1980/oeasypython`
- This time we go all the way from scraping to
- sentiment analysis
- word clouds
- scatter plots
# Scrape the Weibo hot-search list and save it to CSV
import json
import csv

import requests
cookies = {
'SUB': '_2AkMRcyacf8NxqwFRmfsQxG3lb4t3wwrEieKnL9dHJRMxHRl-yT9vqhA7tRB6OvMIc71hiVugltoaHcZ3tShbEbigs6j1',
'SUBP': '0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWlN0z94LR6Vq-2CZGGpQC7',
'XSRF-TOKEN': 'ALBZ5zY_YLCch7bfNKBZmnqA',
'WBPSESS': 'gJ7ElPMf_3q2cdj5JUfmvOmOpSk7C1JxpzOkDaH8sK1Kcrj3y9bjLcsXbcXwDAKNQ5BVj_MmHDUbEie0S_8hpMBP57KXkPC2z7FS_xrFiJH7jJhopjBRFKQtskOuMVg8',
'_s_tentry': 'weibo.com',
'Apache': '9474795353437.963.1714399695446',
'SINAGLOBAL': '9474795353437.963.1714399695446',
'ULV': '1714399695470:1:1:1:9474795353437.963.1714399695446:',
}
headers = {
'accept': 'application/json, text/plain, */*',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'client-version': 'v2.45.7',
'priority': 'u=1, i',
'referer': 'https://weibo.com/newlogin?tabtype=weibo&gid=102803&openLoginLayer=0&url=https%3A%2F%2Fwww.weibo.com%2F',
'sec-ch-ua': '"Chromium";v="124", "Microsoft Edge";v="124", "Not-A.Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'server-version': 'v2024.04.29.1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
'x-requested-with': 'XMLHttpRequest',
'x-xsrf-token': 'ALBZ5zY_YLCch7bfNKBZmnqA',
}
# Request the hot-search API endpoint
response = requests.get('https://weibo.com/ajax/side/hotSearch', cookies=cookies, headers=headers)
# print(response.text)
ws = []
ws.append(['顺序', '热搜分类', '热搜关键词'])  # header row: rank, category, keyword
data = json.loads(response.text)['data']['realtime']
for i in data:
    try:
        print(f'热搜:{i["realpos"]}, 热搜分类[{i["category"]}], 热搜关键词:{i["word"]}')
        ws.append([i["realpos"], i["category"], i["word"].replace(' ', '')])
    except KeyError:
        # some entries (e.g. promoted items) lack these fields; skip them
        pass
with open("热搜.csv", 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows(ws)
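
- If you prefer Excel to CSV, here is a minimal sketch with openpyxl (the filename 热搜.xlsx is an assumption, not from the original script) that writes the same ws rows to a workbook:

# optional: save the same rows to Excel with openpyxl
import openpyxl

wb = openpyxl.Workbook()
sheet = wb.active        # the default worksheet
for row in ws:           # ws holds the header row plus one row per hot search
    sheet.append(row)    # append() writes a list as one worksheet row
wb.save('热搜.xlsx')     # assumed output filename
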
# -*- coding: utf-8 -*-
# Post collection
# 1. Import the crawler libraries
import requests
from lxml import etree
import time
from urllib import parse
import pandas as pd
from fake_useragent import UserAgent
def collect(k):
    resLs = []  # list that accumulates the scraped posts
    for page in range(1, 51):  # scrape pages 1 to 50
        time.sleep(3)  # wait 3 seconds before each request
        url = f'https://s.weibo.com/weibo?q={parse.quote(k)}&xsort=hot&suball=1&Refer=g&page={page}'  # build the request URL
        print(k, page, '>>>')
        headers = {
            'Cookie': ck,  # request cookie
            'User-Agent': ua,  # request User-Agent
            'Referer': url  # Referer header
        }
        while True:
            try:
                res = requests.get(url=url, headers=headers, timeout=(5, 5)).text  # send the request and read the response
                break
            except requests.RequestException:
                time.sleep(5)  # request failed or timed out; wait 5 seconds and retry
        if f'<p>抱歉,未找到“{k}”相关结果。</p>' in res:
            break  # no more results for this keyword; stop scraping
        tree = etree.HTML(res)  # parse the HTML
        for li in tree.xpath('//div[@action-type="feed_list_item"]'):  # iterate over each post
            name = li.xpath('.//a[@class="name"]/text()')[0]  # nickname
            uid = li.xpath('.//a[@class="name"]/@href')[0].split('/')[-1].split('?')[0]  # user ID
            date = li.xpath('.//div[@class="from"]/a/text()')[0].strip()  # post time
            cbox = li.xpath('.//p[@node-type="feed_list_content_full"]')
            cbox = li.xpath('.//p[@node-type="feed_list_content"]')[0] if not cbox else cbox[0]  # post content, full version if present
            cont = ''.join(cbox.xpath('.//text()')).replace('收起d', '').strip()
            tran = li.xpath('.//div[@class="card-act"]/ul/li[1]/a//text()')[1].strip()  # repost count
            try:
                tran = int(tran)
            except ValueError:
                tran = 0  # the cell shows a label instead of a number when the count is zero
            comm = li.xpath('.//div[@class="card-act"]/ul/li[2]/a//text()')[0].strip()  # comment count
            try:
                comm = int(comm)
            except ValueError:
                comm = 0
            like = li.xpath('.//span[@class="woo-like-count"]/text()')[0].strip()  # like count
            try:
                like = int(like)
            except ValueError:
                like = 0
            ID = li.xpath('./@mid')[0]  # post ID
            dic = {
                '昵称': name,
                '用户': uid,
                '时间': date,
                '内容': cont,
                '转发': tran,
                '评论': comm,
                '点赞': like,
                'ID': ID
            }
            resLs.append(dic)  # append the post record to the result list
            print(dic)  # show the record as it is collected
    df = pd.DataFrame(resLs)  # convert the result list to a DataFrame
    df.to_excel('北大宿舍聊天(50页1).xlsx', index=False)  # save the results to an Excel file
if __name__ == '__main__':
    # 2. Fill in a correct, valid cookie string
    ck = 'your cookie'
    # 3. Set a random User-Agent
    ua = UserAgent().chrome
    # 4. Call the function to start the crawler
    collect('北大宿舍聊天')
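
- Pages sorted by heat can shift while the crawler runs, so the same post may be collected twice. A minimal sketch with pandas (the output filename is an assumption) that re-reads the saved file and deduplicates on the ID column written by collect():

# re-read the saved Excel file and drop duplicate post IDs
import pandas as pd

df = pd.read_excel('北大宿舍聊天(50页1).xlsx')
df = df.drop_duplicates(subset='ID')               # keep the first copy of each post
df.to_excel('北大宿舍聊天_去重.xlsx', index=False)  # output filename is an assumption
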
# Collect search results for one keyword and save them to CSV
import csv
import random
import time

import requests
from lxml import etree

f = open('花西子9.10.csv', 'a', encoding='utf-8-sig', newline='')  # output file
csv_write = csv.writer(f)
csv_write.writerow(["关键词", "用户ID", "用户名称", "转发数量", "获赞数量", "评论数量", "时间", "文章"])
n = 1
keys = '花西子眉笔'
# remember which user IDs have been written, to avoid duplicate rows
written_ids = set()
cookies = {
    'SINAGLOBAL': '1848242080404.5386.1711895292295',
    'SCF': 'AiOLF-4l3JRNN3AGu3BOLRvUqWMIUa7rNzjTB4X3jpMSNZoZfPNhrNYd9mQVDRNb9oeakANLSKph_5Vye9eDa5A.',
    'ALF': '1723209315',
    'SUB': '_2A25LivkyDeRhGeFG41QY8C_Pyj-IHXVo5nT6rDV8PUJbkNANLXPWkW1NeIvLJoJHGJxDa4UWavydjjE-FLxGBBAu',
    'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9WF1YLf8.eNv5ZguCsp4i0Tc5JpX5KMhUgL.FoMR1hq4eh20eKe2dJLoIERLxKBLBonL1h5LxK-L12qLB-2LxK-LBo5L12qLxKML1hnLB-eLxKnL1K.LB-i0',
    '_s_tentry': '-',
    'Apache': '773223686730.8506.1720618573190',
    'ULV': '1720618573196:6:1:1:773223686730.8506.1720618573190:1717996997784',
}
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'priority': 'u=0, i',
    'referer': 'https://s.weibo.com/weibo?q=%E8%8A%B1%E8%A5%BF%E5%AD%90',
    'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0',
}
for i in range(1, 50):
    params = {
        'q': keys,
        'page': i,
    }
    response = requests.get('https://s.weibo.com/weibo', params=params, cookies=cookies, headers=headers)
    # time.sleep(random.uniform(4.4, 9.6))  # optional pause between pages
    data = etree.HTML(response.content.decode())
    div_list = data.xpath('//div[@id="pl_feedlist_index"]/div[2]/div')
    for div in div_list:
        try:
            href = div.xpath('.//div[@class="info"]/div[2]/a/@href')[0]
        except IndexError:
            # not a post card (e.g. an ad or a separator); skip it
            continue
        uid = href.split('?')[0].replace('//weibo.com/', '')  # user ID
        name = div.xpath('.//div[@class="info"]/div[2]/a/text()')[0]  # user name
        # skip users whose ID has already been written
        if uid in written_ids:
            continue
        forward = div.xpath('.//div[@class="card-act"]/ul/li[1]/a//text()')[1].replace(' ', '')  # repost count
        if forward == '转发':  # the cell shows the label when the count is zero
            forward = 0
        like = div.xpath('.//div[@class="card-act"]/ul/li[3]/a//span[@class="woo-like-count"]/text()')[0]  # like count
        if like == '赞':
            like = 0
        comment = div.xpath('.//div[@class="card-act"]/ul/li[2]/a//text()')[0].replace(' ', '')  # comment count
        if comment == '评论':
            comment = 0
        date = div.xpath('.//div[@class="from"]/a[1]/text()')  # post time
        date = [x.strip() for x in date if x.strip() != '']
        date = date[0]
        text = div.xpath('.//p/text()')  # post text
        text = [x.strip() for x in text if x.strip() != '']
        text = ''.join(text).replace(' \u200b', '')
        csv_write.writerow([keys, uid, name, forward, like, comment, date, text])
        # remember this user ID so it is not written again
        written_ids.add(uid)
        print({"关键词": keys, "用户ID": uid, "用户名称": name, "转发数量": forward, "获赞数量": like, "评论数量": comment, "时间": date, "文章内容": text})
        n = n + 1
    print(f'爬取了{i}页,累计{n}条')
# close the output file
f.close()
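
- The opening list promises sentiment analysis, a word cloud, and a scatter plot; the scripts above only collect the data. Here is a minimal sketch of those downstream steps, assuming snownlp, jieba, wordcloud, and matplotlib are installed (the font path and output filenames are placeholders, not from the original):

# downstream analysis sketch: sentiment, word cloud, scatter plot
import pandas as pd
import jieba
from snownlp import SnowNLP
from wordcloud import WordCloud
import matplotlib.pyplot as plt

df = pd.read_csv('花西子9.10.csv', encoding='utf-8-sig')
posts = df['文章'].astype(str)

# sentiment: SnowNLP scores each post between 0 (negative) and 1 (positive)
df['sentiment'] = posts.apply(lambda t: SnowNLP(t).sentiments)

# word cloud: segment the posts with jieba, then render the tokens
tokens = ' '.join(jieba.lcut(' '.join(posts)))
wc = WordCloud(font_path='simhei.ttf',  # a Chinese font is required; this path is a placeholder
               background_color='white', width=800, height=600).generate(tokens)
wc.to_file('词云图.png')

# scatter plot: sentiment score against like count
plt.scatter(df['sentiment'], pd.to_numeric(df['获赞数量'], errors='coerce'))
plt.xlabel('sentiment')
plt.ylabel('likes')
plt.savefig('散点图.png')
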
- Get to know an element's tag member
- tag
- The most important thing about etree.Element
- is that elements form a family tree
- more on that next time; a small preview follows
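
- As a hedged preview (the HTML snippet below is made up for illustration), every node lxml parses is an etree.Element whose tag member names it, and getparent() and iteration link the elements into a family tree:

# a tiny sketch of etree.Element and the element family tree
from lxml import etree

root = etree.HTML('<div><p>hello</p><p>world</p></div>')  # lxml wraps this in <html><body>
print(root.tag)                                # html
for p in root.iter('p'):                       # iterate over every <p> descendant
    print(p.tag, p.text, p.getparent().tag)    # p hello div / p world div
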
- This article is from the oeasy systematic Python tutorial.
- To learn Python completely and solidly,
- just search for oeasy.

