Crawling Qidian (起点中文网) with Python
The full script below first collects book URLs from the listing pages at https://www.qidian.com/all, then scrapes each book's detail page plus its comment ajax endpoint, and appends the results to qidian.csv (finished URLs go to doneurl.txt, failed ones to error_url.txt).

# -*- coding: utf-8 -*-
import csv
import datetime
import time
from urllib.parse import urlencode

import requests
import threadpool
from bs4 import BeautifulSoup


def get_unix_time():
    # Current time as a unix timestamp, stored with every record.
    dtime = datetime.datetime.now()
    return int(time.mktime(dtime.timetuple()))


def init():
    # Write the CSV header: book name, author, word count, click count,
    # number of works, score, number of raters, crawl time, book id.
    row = ['book_name', 'author', 'words_count', 'click_count', 'books_count',
           'score', 'j_user_count', 'crawl_time', 'id']
    with open("qidian.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, dialect="excel")
        writer.writerow(row)


def work(url, count=0):
    # Scrape one book detail page and append a row to qidian.csv.
    page = requests.get(url)
    page.encoding = "utf-8"
    soup = BeautifulSoup(page.text, 'lxml')
    try:
        # Pick the fields out of the page.
        book_name = soup.select(".book-info h1 em")[0].text
        author = soup.select(".writer")[0].text
        words_count = soup.select(".book-info p em")[0].text
        click_count = soup.select(".book-info p em")[1].text
        books_count = soup.select(".work-state li em")[0].text
        book_id = url.replace("https://book.qidian.com/info/", "")
        crawl_time = get_unix_time()
        print(url)
        # The score and rater count are not in the static page;
        # they come from the comment ajax endpoint.
        data = {
            '_csrfToken': 'QpbsVhyc5zc0h21NiEweIrLMu2tFOM1RsgfZtWSS',
            'bookId': book_id,
            'pageSize': 15
        }
        other_url = 'https://book.qidian.com/ajax/comment/index?' + urlencode(data)
        page = requests.get(other_url)
        page.encoding = "utf-8"
        cont = page.json()  # parse the JSON response instead of eval()
        score = cont.get('data').get('rate')
        j_user_count = cont.get('data').get('userCount')
        # Append the record.
        row = [book_name, author, words_count, click_count, books_count,
               score, j_user_count, crawl_time, book_id]
        with open("qidian.csv", "a", encoding="utf-8", newline='') as f:
            writer = csv.writer(f, dialect="excel")
            writer.writerow(row)
        with open("doneurl.txt", "a", encoding='utf-8', newline='') as fe:
            fe.write(url + '\n')
    except BaseException:
        if count < 5:
            print('error: failed to extract elements, retry #' + str(count))
            time.sleep(2)
            work(url, count + 1)
        else:
            with open("error_url.txt", "a", encoding='utf-8') as fe:
                fe.write(url + '\n')
            print('error: failed to extract elements, URL written to error_url.txt')


# Collect the 20 book URLs on one listing page.
# (An earlier version used @retry(tries=5, delay=1) from the retry package
# instead of the manual retry counter below.)
def load(i, count=0):
    url = "https://www.qidian.com/all?page=" + str(i)
    try:
        print("Crawling listing page: {}".format(url))
        page = requests.get(url)
        page.encoding = "utf-8"
        soup = BeautifulSoup(page.text, 'lxml')
        elem = soup.select(".book-mid-info h4 a")  # links to the book pages
        urls = ['https:' + a.get('href') for a in elem]
        if len(urls) != 20:
            raise Exception('page {} returned {} links instead of 20'.format(i, len(urls)))
        with open('urls.txt', 'a', encoding='utf-8') as f:
            for cont in urls:
                f.write(cont + '\n')
    except BaseException as e:
        if count < 5:
            load(i, count + 1)
        else:
            print(str(e))
            with open('urllist.txt', 'a', encoding='utf-8') as fp:
                fp.write(url + ' ' + str(i) + '\n')


def loadurl(start, end, thrednum):
    # Crawl listing pages start..end with a thread pool, writing book URLs to urls.txt.
    links = list(range(start, end + 1))
    print(len(links))
    try:
        pool = threadpool.ThreadPool(thrednum)
        reqs = threadpool.makeRequests(load, links)
        for req in reqs:
            pool.putRequest(req)
        pool.wait()
    except KeyboardInterrupt:
        print('Stopped manually')


def spider(start=1, end=2500, thrednum=10):
    # Step 1: collect every book URL into urls.txt.
    loadurl(start, end, thrednum)
    # Step 2: read the URLs back into a list.
    links = []
    with open('urls.txt', 'r', encoding='utf-8') as f:
        for line in f:
            url = line.strip('\n')
            if url:
                links.append(url)
    # Step 3: scrape each book page with a thread pool.
    init()
    try:
        pool = threadpool.ThreadPool(thrednum)
        reqs = threadpool.makeRequests(work, links)
        for req in reqs:
            pool.putRequest(req)
        pool.wait()
    except KeyboardInterrupt:
        print('Stopped manually')


if __name__ == '__main__':
    spider(1, 2500, 50)
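For reference, here is the comment-endpoint request from work() pulled out on its own. This is only a sketch under the same assumptions as the script above: the _csrfToken value and the rate/userCount field names are copied from the original code and may stop working whenever Qidian changes the endpoint; fetch_rating and the example book id are made up for illustration.

import requests
from urllib.parse import urlencode

def fetch_rating(book_id):
    # Query the same comment-index endpoint the spider uses.
    # Token and field names are assumptions carried over from the script above.
    params = {
        '_csrfToken': 'QpbsVhyc5zc0h21NiEweIrLMu2tFOM1RsgfZtWSS',
        'bookId': book_id,
        'pageSize': 15,
    }
    url = 'https://book.qidian.com/ajax/comment/index?' + urlencode(params)
    resp = requests.get(url, timeout=10)
    resp.encoding = 'utf-8'
    cont = resp.json()  # safer than eval() on the raw response text
    data = cont.get('data') or {}
    return data.get('rate'), data.get('userCount')

# Usage (hypothetical id, the part after book.qidian.com/info/ in a book URL):
# score, user_count = fetch_rating('1010468795')

Using resp.json() rather than eval() avoids executing arbitrary response text and handles JSON literals such as true and null correctly.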
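Both loadurl() and spider() fan their jobs out with the third-party threadpool package (pip install threadpool). A minimal sketch of that pattern in isolation, with a placeholder task function standing in for load() or work():

import threadpool

def task(n):
    # Placeholder job; in the spider this is load(page_number) or work(url).
    print('processing', n)

pool = threadpool.ThreadPool(10)                         # 10 worker threads
reqs = threadpool.makeRequests(task, list(range(1, 6)))  # one WorkRequest per argument
for req in reqs:
    pool.putRequest(req)
pool.wait()                                              # block until every request finishes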