Python + pyspider crawler for 知轩藏书, downloading novels to local disk
Here is the source for the 知轩藏书 (zxcs.me) crawler, once again built with Python + pyspider. I keep using the pyspider framework because it can be managed through a web UI (start it with "pyspider all" and the dashboard is served at http://localhost:5000 by default), which makes things convenient: just paste the code into a project and run it.
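Before pasting the script into pyspider, it can help to sanity-check the CSS selectors against a live listing page. Below is a minimal standalone sketch, assuming the requests and pyquery packages are installed (pyquery is the same parser behind pyspider's response.doc); the URL pattern and selector are the ones the spider below uses, and they may need adjusting if the site's markup has changed since this was written.

# -*- coding: utf-8 -*-
# Hypothetical selector check, not part of the original spider
import requests
from pyquery import PyQuery

# Category 23 (都市), page 1 -- the same /sort/<id>/page/<n>/ pattern the spider builds
resp = requests.get('http://www.zxcs.me/sort/23/page/1/', timeout=30)
doc = PyQuery(resp.text)

# The same selector index_page() uses to find novel links
for link in doc('#plist dd > div > a[href^="http"]').items():
    print(link.attr('href'))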
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-04-28 1:24:37
# Project: Zxcs

from pyspider.libs.base_handler import *

# Local directory where downloaded novels are saved
P_dir = '/Home/Book'


class Handler(BaseHandler):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }

    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Category id -> readable name (out of habit; I need the names
    # when importing the results into my own database)
    cater_names = {
        '23': u'都市', '55': u'都市',
        '25': u'武侠', '36': u'武侠', '37': u'武侠',
        '26': u'玄幻', '38': u'玄幻', '39': u'玄幻',
        '27': u'科幻', '40': u'科幻', '41': u'科幻',
        '28': u'历史', '42': u'历史', '43': u'历史',
        '29': u'游戏', '44': u'游戏', '45': u'游戏',
    }

    def __init__(self):
        # Adjust the site address yourself if it changes
        self.base_url1 = 'http://www.zxcs.me/sort/'
        self.base_url2 = '/page/'  # URL shape: /sort/<category>/page/<n>/
        self.page_num = 1
        self.total_num = 5

    @every(minutes=24 * 60)
    def on_start(self):
        while self.page_num <= self.total_num:
            for cater_id in self.cater_names:
                print(cater_id)
                url = self.base_url1 + cater_id + self.base_url2 + str(self.page_num) + '/'
                # The category name rides along in response.save
                self.crawl(url, callback=self.index_page, save=self.cater_names[cater_id])
            self.page_num += 1

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Each listing page links to individual novel pages
        for each in response.doc('#plist dd > div > a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.domain_page)

    def domain_page(self, response):
        # The novel page links to its download page
        for each in response.doc('div[class="filecont"] a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        # The download page lists mirror links for the .rar archive;
        # grab the first one and fetch the file itself
        file_name = response.doc('h2').text()
        for each in response.doc('.panel-body a[href^="http"]').items():
            down = each.attr.href
            print('attachment url is ' + down)
            self.crawl(down, callback=self.save_file, save={'file_name': file_name})
            return {
                'url': response.url,
                'title': response.doc('title').text(),
                'Down': down,
                'file_name': file_name,
            }

    # File download: write the fetched archive bytes to disk
    def save_file(self, response):
        path = P_dir + '/' + response.save['file_name'] + '.rar'
        with open(path, 'wb') as f:
            f.write(response.content)
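One thing worth guarding against: the file name scraped from the page's h2 goes straight into a filesystem path, so a title containing a slash or other special characters would break the write, and the target directory must already exist. A small helper along these lines could be added; safe_path is a name I made up, and the set of characters it replaces is my own choice, not something from the original post.

import os
import re

def safe_path(base_dir, file_name, ext='.rar'):
    # Replace characters that are illegal or awkward in file names
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', file_name).strip()
    # Create the download directory on first use
    if not os.path.isdir(base_dir):
        os.makedirs(base_dir)
    return os.path.join(base_dir, cleaned + ext)

save_file() would then open safe_path(P_dir, response.save['file_name']) instead of concatenating the path by hand.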
Original post: https://www.sxsay.com/872.html