Crawling GitHub Keyword Search Results

Background

Goal: parse the results of a GitHub keyword search and extract the required fields, with the whole process automated.
Approach:

  • Simulate the operations on the page, using the packet-capture tool Fiddler to analyze the relevant anti-crawling measures
  • Fetch the result pages first and parse them offline, so that development does not depend on the network and iteration is faster
  • Refine the crawling strategy afterwards, e.g. concurrency and a proxy pool; later, consider whether a crawling framework would be more efficient

    Note: GitHub provides an official search-code API, but it cannot search across all public repositories, hence the crawler. When an official API is available, always check first whether it can provide the data you need.
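
    For reference, a minimal sketch of querying the official code-search API with requests (the endpoint and parameters follow the GitHub REST API; YOUR_TOKEN is a placeholder personal access token):

    import requests

    # official code-search endpoint; requires an authenticated request
    resp = requests.get(
        "https://api.github.com/search/code",
        params={"q": "your keyword"},
        headers={
            "Accept": "application/vnd.github+json",
            "Authorization": "token YOUR_TOKEN",  # placeholder token
        },
    )
    for item in resp.json().get("items", []):
        print(item["repository"]["full_name"], item["path"])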

Installing Dependencies

The required modules are requests, beautifulsoup4, and the parser lxml:

pip install requests
pip install beautifulsoup4
pip install lxml

Login Authentication

First, simulate the keyword search on the page. It turns out that searches with type code require login authentication, so log in on the page. A Fiddler capture shows that the login data is ultimately submitted to the /session URL with the fields shown below, and testing confirms that submitting only the key fields is enough.

commit: Sign in
authenticity_token: your token
login: your username
password: your password

Of the fields above, it is not obvious where authenticity_token comes from. Since the requested URL is /login but the form is actually submitted to /session, the token is most likely generated dynamically when the login page is requested; searching the login page source for "token" confirms the parameter is indeed there.

At this point the road is basically paved and only the implementation remains. Note the use of the Session object from the requests package: it persists cookies across requests, so there is no need to log in repeatedly.

Implementation

Personally I think it breaks down into two main parts:

  • Before obtaining the results: login authentication, collecting all the request URLs, etc.
  • After obtaining the results: parsing the pages, extracting the fields, etc.

Here is the multi-threaded version of the code. There is still plenty of room for optimization, but it basically runs:

import json
import re
from queue import Empty, Queue
from threading import Thread
from urllib import parse

import requests
import urllib3
from bs4 import BeautifulSoup

urllib3.disable_warnings()

USERNAME = ""
PASSWD = ""
KEYWORD = ""
PROXY = {}  # https://free.kuaidaili.com/free/inha/


class GithubCrawl:
    def __init__(self, username, passwd, keyword):
        self.username = username
        self.passwd = passwd

        self.queue = Queue()
        self.session = requests.Session()
        self.result = []

        self.threads = 5
        self.output_file = "./temp.txt"
        self.login_url = "https://github.com/login"
        self.post_url = "https://github.com/session"
        self.search_url = f"https://github.com/search?q={keyword}&type=code"
        self.headers = {
            "Referer": "https://github.com/",
            "Host": "github.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
        }
        self.proxy = PROXY

    @staticmethod
    def _parse_content(tags):
        content = ""
        for tag in tags:
            content += tag.text
        return content

    def _get_token(self):
        # the authenticity_token is embedded in the login page as a hidden input
        resp = self.session.get(self.login_url, headers=self.headers, verify=False, proxies=self.proxy)
        soup = BeautifulSoup(resp.text, "lxml")
        token = soup.find("input", attrs={"name": "authenticity_token"}).get("value")
        print(f"token is: {token}")
        return token

    def login(self, token):
        post_data = {
            'commit': 'Sign in',
            'login': self.username,
            'password': self.passwd,
            'authenticity_token': token
        }
        # note: a failed login also returns 200 (back on the login page),
        # so get_urls() re-checks the session via the page title
        resp = self.session.post(self.post_url, data=post_data, headers=self.headers, verify=False, proxies=self.proxy)
        if resp.status_code == 200:
            print("successfully set up a session on github...")
            self.get_urls()

    def get_urls(self):
        resp = self.session.get(self.search_url, headers=self.headers, verify=False, proxies=self.proxy)
        soup = BeautifulSoup(resp.text, "lxml")
        if re.search("login", soup.title.text, re.I):
            raise ConnectionError("the session is closed, please check the network or add a proxy!")
        total_pages = soup.find(attrs={"aria-label": "Pagination"}).text.split(" ")[-2]
        for i in range(1, int(total_pages) + 1):
            _url = self.search_url + f"&p={i}"
            print(f"add the url to queue: {_url}")
            self.queue.put(_url)

    def get_data(self):
        while True:
            # get_nowait() avoids the race between empty() and get() across threads
            try:
                url = self.queue.get_nowait()
            except Empty:
                break
            print(f"get url: {url}")
            self.parse_search_page(url)

    def parse_search_page(self, url):
        resp = self.session.get(url, headers=self.headers, verify=False, proxies=self.proxy)
        soup = BeautifulSoup(resp.text, "lxml")
        items = soup.find_all(class_="code-list-item")
        if not items:
            print(f"no data found in the page {url}...")
            return
        print(f"start parsing url: {url}")
        for item in items:
            text_small = item.find(class_="text-small").text.strip().split("/")
            lang = item.find(attrs={"itemprop": "programmingLanguage"})
            data = {
                "author_favicon": item.find("img").attrs["src"],
                "author": text_small[0].strip(),
                "repository": text_small[1].strip(),
                "filename": item.find(class_="text-normal").text.strip(),
                "filepath": parse.urljoin("https://github.com", item.find(class_="text-normal").a.attrs["href"]),
                "content": self._parse_content(item.find_all(class_="blob-code")),
                "language": lang.text if lang else lang,
                "updated_at": item.find(class_="updated-at").find(class_="no-wrap").attrs["datetime"]
            }
            print(data)
            self.result.append(json.dumps(data))

    def write_to_file(self):
        try:
            with open(self.output_file, "w", encoding="utf-8") as f:
                f.writelines([line + "\n" for line in self.result])
            print("finished...")
        except Exception as e:
            print("write result to file failed...")
            raise e

    def start(self):
        token = self._get_token()
        self.login(token)
        t_list = []
        for i in range(self.threads):
            t = Thread(target=self.get_data)
            t_list.append(t)
            t.start()
        for t in t_list:
            t.join()
        print("all tasks finished...")
        self.write_to_file()


def main():
    crawler = GithubCrawl(USERNAME, PASSWD, KEYWORD)
    crawler.start()


if __name__ == '__main__':
    main()
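
A note on the design: every page URL is put on the queue by get_urls() before the worker threads start, so an empty queue reliably means all work is done; using get_nowait() with the Empty exception avoids the race between checking and taking that a bare empty()/get() pair would have.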

Follow-up

I thought that was the end of it, but when I handed the script to a colleague for testing, it failed. A quick investigation showed that GitHub requires extra verification when logging in from a new device: a verification code sent by email must be submitted. So fetching that code automatically has to be added to the flow.

Continuing the Fiddler capture, the device verification is submitted to the URL /sessions/verified-device with the following fields.

authenticity_token: your token
otp: your verification code

Note: this authenticity_token is not the one from the earlier /login page; it is newly generated within the /sessions/verified-device page.
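
A minimal sketch of that extra step, assuming the field names captured above (verify_device is a hypothetical helper that login() would call when the response lands on the verification page):

    def verify_device(self, otp_code):
        verify_url = "https://github.com/sessions/verified-device"
        # the verification page embeds its own, freshly generated token
        resp = self.session.get(verify_url, headers=self.headers, verify=False, proxies=self.proxy)
        soup = BeautifulSoup(resp.text, "lxml")
        token = soup.find("input", attrs={"name": "authenticity_token"}).get("value")
        post_data = {"authenticity_token": token, "otp": otp_code}
        return self.session.post(verify_url, data=post_data, headers=self.headers, verify=False, proxies=self.proxy)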

Finally, here is a simple version of the code that fetches the verification code from the mailbox over POP3. It makes sure that the email it reads is the verification email received after the GitHub login request was sent. In addition, the login method of the GithubCrawl class above needs a small modification; the complete code is on git.

import poplib
import re
import time
from datetime import datetime
from email.header import decode_header
from email.parser import Parser

# email info
EMAIL_ACCOUNT = ""
AUTH_CODE = ""  # must be an authorization code, not the login password
POP3_SSL_SERVER = ""
PROTOCOL = "pop3"


class EmailReceiver:
    def __init__(self, email_account, auth_code, pop3_ssl_server, send_login_time, protocol="pop3"):
        self.email_account = email_account
        self.auth_code = auth_code
        self.pop3_ssl_server = pop3_ssl_server
        self.send_login_time = send_login_time

        self.email_total_number = None

        self.session = self.login_pop3() if protocol == "pop3" else None
        if not self.session:
            raise ConnectionError("[Email Receiver] failed to connect to the email server")

    def login_pop3(self):
        a = poplib.POP3_SSL(self.pop3_ssl_server)
        a.user(self.email_account)
        a.pass_(self.auth_code)
        resp, mails, octets = a.list()
        self.email_total_number = len(mails)
        print(f"[Email Receiver] the number of emails is: {self.email_total_number}")
        return a if resp.decode("utf-8") == "+OK" else None

    def logout(self):
        if self.session:
            self.session.quit()

    @staticmethod
    def decode_str(s):
        value, charset = decode_header(s)[0]
        if charset:
            value = value.decode(charset)
        return value

    @staticmethod
    def _is_latest_email(content, send_login_time) -> bool:
        # compare the time in the Received header with the github login time
        date = content.get("Received", "")
        ret = re.search(r"(?:\d+:){2,}?\d+", date)
        if not ret:
            print(f"[Email Receiver] failed to get the latest email receive time, date: {date}")
            return False
        recv_email_time = ret.group()
        time_diff = datetime.strptime(recv_email_time, "%H:%M:%S") - datetime.strptime(send_login_time, "%H:%M:%S")
        print(f"recv: {recv_email_time}, login: {send_login_time}, {time_diff}")
        if time_diff.days < 0:
            print("[Email Receiver] the latest email was not received after logging into github")
        return time_diff.days == 0

    def _is_github_verify_email(self, content) -> bool:
        subject = self.decode_str(content.get("Subject", ""))
        from_ = self.decode_str(content.get("From", ""))
        ret = re.search(r"\[GitHub] Please verify your device", subject)
        if not ret:
            print(f"[Email Receiver] the latest email is not from github, subject: {subject}, From: {from_}")
            return False
        return True

    @staticmethod
    def get_email_content(session, total_number):
        print(f"[Email Receiver] current total_number is: {total_number}")
        resp, lines, octets = session.retr(total_number)
        msg_content = b"\r\n".join(lines).decode("utf-8", "ignore")
        content = Parser().parsestr(msg_content)
        return content

    def get_verification_code(self):
        content = self.get_email_content(self.session, self.email_total_number)
        flag = False
        for index in range(5):
            if self._is_github_verify_email(content) and self._is_latest_email(content, self.send_login_time):
                flag = True
                break
            time.sleep(6)

            # poplib cannot see mail that arrives during an existing connection,
            # so open a temporary connection to re-check the mailbox
            temp_session = poplib.POP3_SSL(self.pop3_ssl_server)
            temp_session.user(self.email_account)
            temp_session.pass_(self.auth_code)
            emails, _ = temp_session.stat()
            if emails > self.email_total_number:
                content = self.get_email_content(temp_session, emails)
            temp_session.quit()

        verification_code = re.search(r"Verification code: (\d+)", str(content))
        if flag and verification_code:
            return verification_code.groups()[0]
        print("[Email Receiver] failed to get the github verification code after 5 tries")
Note: in my testing, poplib cannot see newly arrived mail within a single connection, so the current workaround is to open a temporary new connection for each check. I later found that the third-party Python module zmail supports fetching the latest mail in real time; I had a quick look at it, see the follow-up article for details.
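
For reference, a rough sketch of the same check using zmail (based on its documented server()/get_latest() interface; the credentials are placeholders and this is untested here):

import re
import zmail

# zmail infers the mail server settings from the address
server = zmail.server("user@example.com", "auth_code")  # placeholder credentials
mail = server.get_latest()
if "[GitHub] Please verify your device" in mail["subject"]:
    ret = re.search(r"Verification code: (\d+)", " ".join(mail["content_text"]))
    if ret:
        print(ret.group(1))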