import datetime
import json
import os
import re
from requests_html import HTMLSession
import requests
import pdfkit
import wechatsogou
# pdfkit configuration pointing at a hard-coded wkhtmltopdf binary path; it is
# passed to pdfkit.from_string() when a PDF is rendered. Adjust the path if the
# binary lives elsewhere on the machine running this script.
confi = pdfkit.configuration(wkhtmltopdf=r"/usr/local/bin/wkhtmltopdf")
# Shared WechatSogouAPI client, used to fetch article HTML via get_article_content().
# NOTE(review): captcha_break_time=3 presumably caps captcha retry attempts — confirm
# against the wechatsogou documentation.
ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3)
class WxMps(object):
    """Collect WeChat official-account album articles and save each one as a PDF.

    For every article the class fetches the article HTML, looks up the first
    elected comment through the ``appmsg_comment`` endpoint, and renders both
    into a PDF with pdfkit.
    """

    # Seconds to wait for any single HTTP response before giving up.
    REQUEST_TIMEOUT = 30
    # Number of articles requested per page from the album listing endpoint.
    PAGE_SIZE = 20

    def __init__(self, _biz, _pass_ticket, _app_msg_token, _cookie, _offset=0):
        """Store the account identifier and the session credentials.

        Args:
            _biz: Official-account identifier (the ``__biz`` query parameter).
            _pass_ticket: Session ticket (not fixed; changes between sessions).
            _app_msg_token: Session token (not fixed); stored as ``msg_token``.
            _cookie: Cookie header value (not fixed) sent with every request.
            _offset: Stored as ``offset``; not read by any method of this class.
        """
        self.offset = _offset
        self.biz = _biz  # official-account identifier
        self.msg_token = _app_msg_token  # ticket (not fixed)
        self.pass_ticket = _pass_ticket  # ticket (not fixed)
        self.headers = {
            'cookie': _cookie,  # cookie (not fixed)
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) MicroMessenger/6.8.0(0x16080000) MacWechat/3.3.1(0x13030111) Safari/605.1.15 NetType/WIF'
        }

    def dypdf(self, h1, data, comment):
        """Wrap an article fragment in a full HTML page and render it to PDF.

        Args:
            h1: Used as the page title and heading AND as the output file path
                handed to pdfkit.
            data: Article HTML fragment placed inside ``<body>``.
            comment: Text appended after the article as the answer.
        """
        # Complete HTML document built around the article fragment.
        datas = f'''
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{h1}</title>
</head>
<body>
<h2 style="text-align: center;font-weight: 400;">{h1}</h2>
{data}
<br>
答案:{comment}
</br>
</body>
</html>
'''
        print("开始打印内容!")
        pdfkit.from_string(datas, h1, configuration=confi)
        print("打印保存成功!")

    @staticmethod
    def _extract_comment_params(page):
        """Return ``(app_msg_id, comment_id, appmsg_token)`` parsed from article HTML.

        Returns ``None`` when any of the three values is missing or empty,
        because the comment endpoint needs all of them.
        """
        found_comment = re.search(r'var comment_id = "(.*)" \|\| "(.*)" \* 1;', page)
        found_msg = re.search(r"var appmsgid = \"\" \|\| \'\' \|\| '(.*)'", page)
        found_token = re.search(r'window.appmsg_token = "(.*)";', page)
        if not (found_comment and found_msg and found_token):
            return None
        params = (found_msg.group(1), found_comment.group(1), found_token.group(1))
        return params if all(params) else None

    def _parse_article_detail(self, content_url):
        """Fetch the article page and return its first elected comment.

        Args:
            content_url: URL of the article page.

        Returns:
            The comment text, or ``None`` when the page cannot be downloaded,
            the required parameters are not present in the page, or the
            article has no elected comment.
        """
        # The headers are deliberately not printed: they contain the session cookie.
        try:
            page = requests.get(content_url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT).text
        except requests.RequestException:
            print('获取评论失败' + content_url)
            return None
        params = self._extract_comment_params(page)
        if params is None:
            return None
        app_msg_id, comment_id, appmsg_token = params
        print('Crawl article comments: ' + content_url)
        return self._crawl_comments(app_msg_id, comment_id, appmsg_token)

    @staticmethod
    def _first_comment(resp):
        """Return the first elected comment's text from a decoded response, or None."""
        base = resp.get('base_resp') or {}
        if base.get('ret') != 0 and base.get('errmsg') != 'ok':
            return None
        elected = resp.get('elected_comment') or []
        # An article without elected comments yields an empty list here.
        return elected[0]['content'] if elected else None

    def _crawl_comments(self, app_msg_id, comment_id, appmsg_token):
        """Query the comment endpoint and return the first elected comment.

        Args:
            app_msg_id: Article message id parsed from the article page.
            comment_id: Comment id parsed from the article page.
            appmsg_token: Token parsed from the article page.

        Returns:
            The comment text, or ``None`` if the request was rejected or the
            article has no elected comment.

        Raises:
            requests.RequestException: If the HTTP request fails.
            ValueError: If the response body is not valid JSON.
        """
        api = ('https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&scene=0&__biz={0}'
               '&appmsgid={1}&idx=1&comment_id={2}&offset=0&limit=100&uin=777&key=777'
               '&pass_ticket={3}&wxtoken=777&devicetype=android-26&clientversion=26060739'
               '&appmsg_token={4}&x5=1&f=json').format(self.biz, app_msg_id, comment_id,
                                                       self.pass_ticket, appmsg_token)
        resp = requests.get(api, headers=self.headers,
                            timeout=self.REQUEST_TIMEOUT).json()
        return self._first_comment(resp)

    def wx(self, h1, url):
        """Download one article plus its first comment and save them as a PDF.

        Args:
            h1: Output path of the PDF (also used as the page heading).
            url: Article URL.

        Raises:
            RuntimeError: If no comment could be obtained for the article.
        """
        content_info = ws_api.get_article_content(url)
        # HTML fragment only; dypdf() adds the surrounding head/body tags.
        html_code = content_info['content_html']
        comment = self._parse_article_detail(url)
        if comment is None:
            print('没有评论')
            # Explicit exception instead of a bare ``raise`` outside ``except``,
            # which only produced a RuntimeError by accident.
            raise RuntimeError('没有评论')
        self.dypdf(h1, html_code, comment)

    def album_to_url(self, album_id):
        """Resolve each album id to its name and its first listed article.

        Args:
            album_id: Iterable of album id strings.

        Returns:
            A list of dicts with keys ``album_id``, ``begin_msgid``, ``name``,
            ``title`` and ``link``; an empty list when ``album_id`` is empty.
        """
        if not album_id:
            return []
        first_item = '//*[@id="js_content_overlay"]/div[1]/div/div[5]/ul/li[1]'
        res = []
        # One session serves every album and is always closed, even on error.
        session = HTMLSession()
        try:
            for i in album_id:
                url = ('https://mp.weixin.qq.com/mp/appmsgalbum?action=getalbum'
                       f'&__biz={self.biz}&scene=1&album_id={i}')
                response = session.get(url, headers=self.headers,
                                       timeout=self.REQUEST_TIMEOUT)
                name = response.html.xpath('//*[@id="js_tag_name"]/text()')[0]
                begin_msgid = response.html.xpath(first_item + '/@data-msgid')[0]
                link = response.html.xpath(first_item + '/@data-link')[0]
                title = response.html.xpath(first_item + '/@data-title')[0]
                res.append({'album_id': i, 'begin_msgid': begin_msgid, "name": name,
                            'title': title, 'link': link})
        finally:
            session.close()
        return res

    def get_url(self, album_id, begin_msgid, title, link):
        """List the articles of an album, following pagination.

        Args:
            album_id: Album id.
            begin_msgid: Message id of the article to start listing after.
            title: Title of the first article (seeded into the result).
            link: URL of the first article (seeded into the result).

        Returns:
            A list whose first element is ``{'title': title, 'url': link}``,
            followed by the ``article_list`` entries returned by the endpoint.

        Raises:
            requests.RequestException: If an HTTP request fails.
            ValueError: If a response body is not valid JSON.
        """
        count = self.PAGE_SIZE
        url_list = [{'title': title, 'url': link}]
        while True:
            url = ('https://mp.weixin.qq.com/mp/appmsgalbum?action=getalbum'
                   f'&__biz={self.biz}&album_id={album_id}&count={count}'
                   f'&begin_msgid={begin_msgid}&begin_itemidx=1&f=json')
            response = requests.get(url, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
            res = json.loads(response.text)
            try:
                articles = res['getalbum_resp']['article_list']
                page_len = len(articles)
            except (KeyError, TypeError):
                # No article list in the response: nothing more to read.
                break
            if page_len == 0:
                # An empty page would otherwise repeat the same request forever.
                break
            url_list.extend(articles)
            if page_len != count:
                # Partial page: this was the last one.
                break
            # Full page: continue after its last article.
            begin_msgid = articles[-1]['msgid']
            print('翻页')
        return url_list
def to_https_url(url):
    """Return *url* with a leading ``http://`` scheme upgraded to ``https://``.

    URLs that already use ``https://`` (or any other scheme) are returned
    unchanged, and any later ``http`` occurrence inside the URL is left alone.
    """
    plain_scheme = 'http://'
    if url.startswith(plain_scheme):
        return 'https://' + url[len(plain_scheme):]
    return url


def main():
    """Export every article of the configured albums to ``<album name>/<title>.pdf``."""
    biz = 'MzI3MDMxNTYwMg=='
    # Session credentials; fill these in before running.
    pass_ticket = ''
    app_msg_token = ''
    cookie = ''
    album_id = ['1630081855687360514']
    wxMps = WxMps(biz, pass_ticket, app_msg_token, cookie)
    urls = wxMps.album_to_url(album_id)
    if not urls:
        print('没有可爬取的')
        return
    for album in urls:
        # One output directory per album, created once up front.
        os.makedirs(f"./{album['name']}", exist_ok=True)
        articles = wxMps.get_url(album['album_id'], album['begin_msgid'],
                                 album['title'], album['link'])
        for article in articles:
            print(album['name'], article['title'], article['url'])
            # A '/' in the title would point the PDF into a missing sub-directory.
            file_title = article['title'].replace('/', '_')
            wxMps.wx(f"{album['name']}/{file_title}.pdf", to_https_url(article['url']))


if __name__ == '__main__':
    main()
微信公众号专栏文章采集+首条评论(PDF格式输出)
©著作权归作者所有,转载或内容合作请联系作者
- 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
- 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
- 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
推荐阅读更多精彩内容
- 成长记录-连载(三十六) ——我的第一篇五千字长文,说了什么,你一定想不到 并不是不想每天写公众号,而是之前思考怎...
- 前言 背景: 某一天,拿着自己的手机看着技术文章,然而手机看技术文章,有时候确实蛋疼,因为一旦代码多起来,小屏幕看...
- 嗯,今天要介绍的是「小蜜蜂公众号文章助手」(下称助手),经过4、5个月的时间,版本已经到了 2.1.0 了,也从最...