Crawling GitHub Keyword Search Results

Background

Goal: parse the results of a GitHub keyword search and extract the required fields, with the whole process automated.
Approach:

  • Simulate the operations on the page, using the packet-capture tool Fiddler to analyze the relevant anti-crawling measures
  • Fetch the result pages first and parse them offline, so that development does not depend on the network and iteration is faster
  • Refine the crawling strategy afterwards, e.g. concurrency and a proxy pool; later, consider whether a crawling framework would be more efficient

    Note: GitHub provides an official search-code API, but it cannot search across all public repositories, hence the crawler. When an official API is available, always check first whether it can provide the data you need.
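
    For reference, a minimal sketch of querying the official code-search API with requests (the endpoint and parameters follow the GitHub REST API; YOUR_TOKEN is a placeholder personal access token):

    import requests

    # official code-search endpoint; requires an authenticated request
    resp = requests.get(
        "https://api.github.com/search/code",
        params={"q": "your keyword"},
        headers={
            "Accept": "application/vnd.github+json",
            "Authorization": "token YOUR_TOKEN",  # placeholder token
        },
    )
    for item in resp.json().get("items", []):
        print(item["repository"]["full_name"], item["path"])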

Installing Dependencies

The required modules are requests, beautifulsoup4, and the parser lxml:

pip install requests
pip install beautifulsoup4
pip install lxml

Login Authentication

First, simulate the keyword search on the page. It turns out that searches with type code require login authentication, so log in on the page. A Fiddler capture shows that the login data is ultimately submitted to the /session URL with the fields shown below, and testing confirms that submitting only the key fields is enough.

commit: Sign in
authenticity_token: your token
login: your username
password: your password

Of the fields above, it is not obvious where authenticity_token comes from. Since the requested URL is /login but the form is actually submitted to /session, the token is most likely generated dynamically when the login page is requested; searching the login page source for "token" confirms the parameter is indeed there.

At this point the road is basically paved and only the implementation remains. Note the use of the Session object from the requests package: it persists cookies across requests, so there is no need to log in repeatedly.

Implementation

Personally I think it breaks down into two main parts:

  • Before obtaining the results: login authentication, collecting all the request URLs, etc.
  • After obtaining the results: parsing the pages, extracting the fields, etc.

Here is the multi-threaded version of the code. There is still plenty of room for optimization, but it basically runs:

import json
import re
from queue import Empty, Queue
from threading import Thread
from urllib import parse

import requests
import urllib3
from bs4 import BeautifulSoup

urllib3.disable_warnings()

USERNAME = ""
PASSWD = ""
KEYWORD = ""
PROXY = {}  # https://free.kuaidaili.com/free/inha/


class GithubCrawl:
    def __init__(self, username, passwd, keyword):
        self.username = username
        self.passwd = passwd

        self.queue = Queue()
        self.session = requests.Session()
        self.result = []

        self.threads = 5
        self.output_file = "./temp.txt"
        self.login_url = "https://github.com/login"
        self.post_url = "https://github.com/session"
        self.search_url = f"https://github.com/search?q={keyword}&type=code"
        self.headers = {
            "Referer": "https://github.com/",
            "Host": "github.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
        }
        self.proxy = PROXY

    @staticmethod
    def _parse_content(tags):
        content = ""
        for tag in tags:
            content += tag.text
        return content

    def _get_token(self):
        # the authenticity_token is embedded in the login page as a hidden input
        resp = self.session.get(self.login_url, headers=self.headers, verify=False, proxies=self.proxy)
        soup = BeautifulSoup(resp.text, "lxml")
        token = soup.find("input", attrs={"name": "authenticity_token"}).get("value")
        print(f"token is: {token}")
        return token

    def login(self, token):
        post_data = {
            'commit': 'Sign in',
            'login': self.username,
            'password': self.passwd,
            'authenticity_token': token
        }
        # note: a failed login also returns 200 (back on the login page),
        # so get_urls() re-checks the session via the page title
        resp = self.session.post(self.post_url, data=post_data, headers=self.headers, verify=False, proxies=self.proxy)
        if resp.status_code == 200:
            print("successfully set up a session on github...")
            self.get_urls()

    def get_urls(self):
        resp = self.session.get(self.search_url, headers=self.headers, verify=False, proxies=self.proxy)
        soup = BeautifulSoup(resp.text, "lxml")
        if re.search("login", soup.title.text, re.I):
            raise ConnectionError("the session is closed, please check the network or add a proxy!")
        total_pages = soup.find(attrs={"aria-label": "Pagination"}).text.split(" ")[-2]
        for i in range(1, int(total_pages) + 1):
            _url = self.search_url + f"&p={i}"
            print(f"add the url to queue: {_url}")
            self.queue.put(_url)

    def get_data(self):
        while True:
            # get_nowait() avoids the race between empty() and get() across threads
            try:
                url = self.queue.get_nowait()
            except Empty:
                break
            print(f"get url: {url}")
            self.parse_search_page(url)

    def parse_search_page(self, url):
        resp = self.session.get(url, headers=self.headers, verify=False, proxies=self.proxy)
        soup = BeautifulSoup(resp.text, "lxml")
        items = soup.find_all(class_="code-list-item")
        if not items:
            print(f"no data found in the page {url}...")
            return
        print(f"start parsing url: {url}")
        for item in items:
            text_small = item.find(class_="text-small").text.strip().split("/")
            lang = item.find(attrs={"itemprop": "programmingLanguage"})
            data = {
                "author_favicon": item.find("img").attrs["src"],
                "author": text_small[0].strip(),
                "repository": text_small[1].strip(),
                "filename": item.find(class_="text-normal").text.strip(),
                "filepath": parse.urljoin("https://github.com", item.find(class_="text-normal").a.attrs["href"]),
                "content": self._parse_content(item.find_all(class_="blob-code")),
                "language": lang.text if lang else lang,
                "updated_at": item.find(class_="updated-at").find(class_="no-wrap").attrs["datetime"]
            }
            print(data)
            self.result.append(json.dumps(data))

    def write_to_file(self):
        try:
            with open(self.output_file, "w", encoding="utf-8") as f:
                f.writelines([line + "\n" for line in self.result])
            print("finished...")
        except Exception as e:
            print("write result to file failed...")
            raise e

    def start(self):
        token = self._get_token()
        self.login(token)
        t_list = []
        for i in range(self.threads):
            t = Thread(target=self.get_data)
            t_list.append(t)
            t.start()
        for t in t_list:
            t.join()
        print("all tasks finished...")
        self.write_to_file()


def main():
    crawler = GithubCrawl(USERNAME, PASSWD, KEYWORD)
    crawler.start()


if __name__ == '__main__':
    main()
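
A note on the design: every page URL is put on the queue by get_urls() before the worker threads start, so an empty queue reliably means all work is done; using get_nowait() with the Empty exception avoids the race between checking and taking that a bare empty()/get() pair would have.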

Follow-up

I thought that was the end of it, but when I handed the script to a colleague for testing, it failed. A quick investigation showed that GitHub requires extra verification when logging in from a new device: a verification code sent by email must be submitted. So fetching that code automatically has to be added to the flow.

Continuing the Fiddler capture, the device verification is submitted to the URL /sessions/verified-device with the following fields.

authenticity_token: your token
otp: your verification code

Note: this authenticity_token is not the one from the earlier /login page; it is newly generated within the /sessions/verified-device page.
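
A minimal sketch of that extra step, assuming the field names captured above (verify_device is a hypothetical helper that login() would call when the response lands on the verification page):

    def verify_device(self, otp_code):
        verify_url = "https://github.com/sessions/verified-device"
        # the verification page embeds its own, freshly generated token
        resp = self.session.get(verify_url, headers=self.headers, verify=False, proxies=self.proxy)
        soup = BeautifulSoup(resp.text, "lxml")
        token = soup.find("input", attrs={"name": "authenticity_token"}).get("value")
        post_data = {"authenticity_token": token, "otp": otp_code}
        return self.session.post(verify_url, data=post_data, headers=self.headers, verify=False, proxies=self.proxy)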

Finally, here is a simple version of the code that fetches the verification code from the mailbox over POP3. It makes sure that the email it reads is the verification email received after the GitHub login request was sent. In addition, the login method of the GithubCrawl class above needs a small modification; the complete code is on git.

import poplib
import re
import time
from datetime import datetime
from email.header import decode_header
from email.parser import Parser

# email info
EMAIL_ACCOUNT = ""
AUTH_CODE = ""  # must be an authorization code, not the login password
POP3_SSL_SERVER = ""
PROTOCOL = "pop3"


class EmailReceiver:
    def __init__(self, email_account, auth_code, pop3_ssl_server, send_login_time, protocol="pop3"):
        self.email_account = email_account
        self.auth_code = auth_code
        self.pop3_ssl_server = pop3_ssl_server
        self.send_login_time = send_login_time

        self.email_total_number = None

        self.session = self.login_pop3() if protocol == "pop3" else None
        if not self.session:
            raise ConnectionError("[Email Receiver] failed to connect to the email server")

    def login_pop3(self):
        a = poplib.POP3_SSL(self.pop3_ssl_server)
        a.user(self.email_account)
        a.pass_(self.auth_code)
        resp, mails, octets = a.list()
        self.email_total_number = len(mails)
        print(f"[Email Receiver] the number of emails is: {self.email_total_number}")
        return a if resp.decode("utf-8") == "+OK" else None

    def logout(self):
        if self.session:
            self.session.quit()

    @staticmethod
    def decode_str(s):
        value, charset = decode_header(s)[0]
        if charset:
            value = value.decode(charset)
        return value

    @staticmethod
    def _is_latest_email(content, send_login_time) -> bool:
        # compare the time in the Received header with the github login time
        date = content.get("Received", "")
        ret = re.search(r"(?:\d+:){2,}?\d+", date)
        if not ret:
            print(f"[Email Receiver] failed to get the latest email receive time, date: {date}")
            return False
        recv_email_time = ret.group()
        time_diff = datetime.strptime(recv_email_time, "%H:%M:%S") - datetime.strptime(send_login_time, "%H:%M:%S")
        print(f"recv: {recv_email_time}, login: {send_login_time}, {time_diff}")
        if time_diff.days < 0:
            print("[Email Receiver] the latest email was not received after logging into github")
        return time_diff.days == 0

    def _is_github_verify_email(self, content) -> bool:
        subject = self.decode_str(content.get("Subject", ""))
        from_ = self.decode_str(content.get("From", ""))
        ret = re.search(r"\[GitHub] Please verify your device", subject)
        if not ret:
            print(f"[Email Receiver] the latest email is not from github, subject: {subject}, From: {from_}")
            return False
        return True

    @staticmethod
    def get_email_content(session, total_number):
        print(f"[Email Receiver] current total_number is: {total_number}")
        resp, lines, octets = session.retr(total_number)
        msg_content = b"\r\n".join(lines).decode("utf-8", "ignore")
        content = Parser().parsestr(msg_content)
        return content

    def get_verification_code(self):
        content = self.get_email_content(self.session, self.email_total_number)
        flag = False
        for index in range(5):
            if self._is_github_verify_email(content) and self._is_latest_email(content, self.send_login_time):
                flag = True
                break
            time.sleep(6)

            # poplib cannot see mail that arrives during an existing connection,
            # so open a temporary connection to re-check the mailbox
            temp_session = poplib.POP3_SSL(self.pop3_ssl_server)
            temp_session.user(self.email_account)
            temp_session.pass_(self.auth_code)
            emails, _ = temp_session.stat()
            if emails > self.email_total_number:
                content = self.get_email_content(temp_session, emails)
            temp_session.quit()

        verification_code = re.search(r"Verification code: (\d+)", str(content))
        if flag and verification_code:
            return verification_code.groups()[0]
        print("[Email Receiver] failed to get the github verification code after 5 tries")
Note: in my testing, poplib cannot see newly arrived mail within a single connection, so the current workaround is to open a temporary new connection for each check. I later found that the third-party Python module zmail supports fetching the latest mail in real time; I had a quick look at it, see the follow-up article for details.
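
For reference, a rough sketch of the same check using zmail (based on its documented server()/get_latest() interface; the credentials are placeholders and this is untested here):

import re
import zmail

# zmail infers the mail server settings from the address
server = zmail.server("user@example.com", "auth_code")  # placeholder credentials
mail = server.get_latest()
if "[GitHub] Please verify your device" in mail["subject"]:
    ret = re.search(r"Verification code: (\d+)", " ".join(mail["content_text"]))
    if ret:
        print(ret.group(1))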