64774

域名过滤

# Domain filter: read domains from a local list, probe each over HTTPS,
# and append the reachable ones to an output file.
import multiprocessing
import re

import requests


class Get_url(object):
    """Probe domains (m.* / www.* entries) over HTTPS and persist the reachable ones."""

    def __init__(self):
        # Browser-like User-Agent so naive bot filters don't reject the probe.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36",
        }

    def url_list(self, q):
        """Producer: read '../domain_one/jg.txt' and enqueue the domain list once.

        BUG FIX: the original called q.put(data) inside the per-line loop,
        enqueueing the ENTIRE list once per line, so every domain was probed
        N times. The list is now enqueued exactly once, with whitespace/newlines
        stripped (readlines() keeps the trailing '\n', which corrupted the URL).
        """
        with open('../domain_one/jg.txt', 'r', encoding='utf-8') as f:
            data = [line.strip() for line in f if line.strip()]
        q.put(data)

    def get_url_list(self, q):
        """Consumer: fetch each queued domain over HTTPS and save successes.

        Workflow:
        1. take a batch of domains off the queue
        2. request each one at https://<domain>
        3. keep only domains that respond with a 2xx status
        4. append them to '../domain_two/domain.txt'
        """
        while True:
            url_list = q.get()
            for url in url_list:
                ur = "https://" + url
                try:
                    # verify=False: reachability is all we test, so accept bad certs.
                    response = requests.get(url=ur, headers=self.headers,
                                            verify=False, timeout=2)
                    # Raises requests.HTTPError for any non-2xx status, so the
                    # original redundant `status_code == 200` check is dropped.
                    response.raise_for_status()
                    print(ur)
                    # Strip the scheme back off before persisting.
                    domain = re.sub('https://', '', ur)
                    # BUG FIX: write a newline so saved domains don't run together.
                    with open('../domain_two/domain.txt', 'a+', encoding='utf-8') as f:
                        f.write(domain + '\n')
                except requests.ConnectTimeout:
                    print('超时!')
                except requests.HTTPError:
                    print('http状态码非200')
                except Exception as e:
                    print('未进行容错处理的情况:', e)
            # NOTE(review): empty-queue check as loop exit is racy in general,
            # but safe here because the producer enqueues exactly one item.
            if q.empty():
                break

    def main(self):
        """Start one producer and one consumer process sharing a queue."""
        q = multiprocessing.Queue()
        p1 = multiprocessing.Process(target=self.url_list, args=(q,))
        p2 = multiprocessing.Process(target=self.get_url_list, args=(q,))
        p1.start()
        p2.start()


# Entry point: spawn 10 producer/consumer pairs, as in the original script.
if __name__ == '__main__':
    g = Get_url()
    for i in range(10):
        g.main()

来源:博客园

作者:Victor_JJ

链接:https://www.cnblogs.com/victorstudy/p/11425878.html

Recommend