Here is the source code for my 知轩藏书 (zxcs.me) crawler. As before it is Python + pyspider: the pyspider framework comes with a web UI for managing spiders, so you only need to paste the code in and run it.
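If you have not used pyspider before: install it with pip install pyspider, start it with pyspider all, then open the web UI (http://localhost:5000 by default), create a new project, and paste the script below into the editor.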
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-04-28 1:24:37
# Project: Zxcs
from pyspider.libs.base_handler import *
import os
import urllib2  # Python 2, matching the pyspider setup this was written for

# Local path where downloaded novels are saved -- change to taste
P_dir = '/Home/Book'

class Handler(BaseHandler):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }
    def __init__(self):
        # Listing URL pieces -- adjust the host to your own needs
        self.base_url1 = 'http://www.zxcs.me/sort/'
        self.base_url2 = '/page/'  # note the leading '/'
        self.CaterId = []
        self.CaterIds = ['23', '25', '26', '27', '28', '29', '36', '37', '38',
                         '39', '40', '41', '42', '43', '44', '45', '55']
        self.page_num = 1
        self.total_num = 5
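        # e.g. category 26, page 1 joins up as http://www.zxcs.me/sort/26/page/1/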
    @every(minutes=24 * 60)
    def on_start(self):
        # Map category IDs to readable names (a habit of mine: the names are
        # needed later when importing the results into my own database)
        cate_names = {
            '26': u'玄幻', '38': u'玄幻', '39': u'玄幻',
            '25': u'武侠', '36': u'武侠', '37': u'武侠',
            '28': u'历史', '42': u'历史', '43': u'历史',
            '23': u'都市', '55': u'都市',
            '27': u'科幻', '40': u'科幻', '41': u'科幻',
            '29': u'游戏', '44': u'游戏', '45': u'游戏',
        }
        while self.page_num <= self.total_num:
            for self.CaterId in self.CaterIds:
                Cater_Name = cate_names.get(self.CaterId, u'')
                print(self.CaterId)
                url = self.base_url1 + self.CaterId + self.base_url2 + str(self.page_num) + '/'
                self.crawl(url, callback=self.index_page, save=Cater_Name)
            self.page_num += 1
    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Each listing page links out to individual book pages
        for each in response.doc('#plist dd > div > a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.domain_page)
    def domain_page(self, response):
        # Each book page carries one or more links to its download page
        for each in response.doc('div[class="filecont"] a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)
    @config(priority=2)
    def detail_page(self, response):
        file_name = response.doc('h2').text()
        Down = None
        for each in response.doc('.panel-body a[href^="http"]').items():
            Down = each.attr.href
            if self.download(P_dir, file_name, Down):
                print('attachment url is ' + Down)
        # pyspider stores this dict in its result database
        return {
            "url": response.url,
            "title": response.doc('title').text(),
            "Down": Down,
            "file_name": file_name,
        }

    # File download: fetch the attachment at `Down` and write it to
    # P_dir/<file_name>.rar (fetched inline with urllib2)
    def download(self, P_dir, file_name, Down):
        file_path = os.path.join(P_dir, file_name + ".rar")
        data = urllib2.urlopen(Down).read()
        with open(file_path, "wb") as f:
            f.write(data)
        return True
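A more pyspider-idiomatic variant for saving the files is to hand the attachment URL back to the scheduler rather than fetching it inline, so downloads get the framework's retry and timeout handling. A minimal sketch (detail_page_v2 and save_file are illustrative names, not from the original script):
    @config(priority=2)
    def detail_page_v2(self, response):
        # Same page as detail_page, but delegate the file fetch to pyspider
        file_name = response.doc('h2').text()
        for each in response.doc('.panel-body a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.save_file,
                       save={'file_name': file_name})

    def save_file(self, response):
        # response.content is the raw body of the fetched archive;
        # response.save is the dict passed to self.crawl() above
        path = os.path.join(P_dir, response.save['file_name'] + '.rar')
        with open(path, 'wb') as f:
            f.write(response.content)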
Original post: https://www.sxsay.com/872.html