Python + pyspider知轩藏书的爬虫,下载到本地

1,130次阅读
没有评论

共计 2784 个字符,预计需要花费 7 分钟才能阅读完成。

知轩藏书的爬虫,源码放出,依旧是 Python + pyspider。因为 pyspider 框架可以通过 WEB 界面进行管理,十分方便,只要把代码复制进去运行即可。

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-04-28 1:24:37
# Project: Zxcs
from pyspider.libs.base_handler import *
import re
import os
import codecs
import sys
import urllib2,HTMLParser,re
class Handler(BaseHandler):
    """pyspider crawler for zxcs.me.

    Walks the listing pages of each novel category, follows every novel
    to its download page, extracts the attachment URL, and saves the
    archive under ``P_dir`` on the local disk.
    """

    # Shared scratch dict (kept for compatibility with the original script).
    global Datos
    Datos = {}

    global P_dir
    # Local directory where downloaded novels are saved.
    P_dir = '/Home/Book'

    # Browser-like request headers so the site serves normal pages.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }

    # BUG FIX: the original assigned crawl_config twice; the second
    # assignment ({}) silently discarded the headers and the timeout.
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Category id -> human-readable category name (replaces the original
    # 17-branch if chain; the mapping is identical).
    CATEGORY_NAMES = {
        '26': '玄幻', '38': '玄幻', '39': '玄幻',
        '25': '武侠', '36': '武侠', '37': '武侠',
        '28': '历史', '42': '历史', '43': '历史',
        '23': '都市', '55': '都市',
        '27': '科幻', '40': '科幻', '41': '科幻',
        '29': '游戏', '44': '游戏', '45': '游戏',
    }

    def __init__(self):
        # Listing URL pieces; edit base_url1 if the site domain changes.
        self.base_url1 = 'http://www.zxcs.me/sort/'
        self.base_url2 = 'page/'
        self.CaterId = []
        # Category ids to crawl (keys of CATEGORY_NAMES).
        self.CaterIds = ['23', '25', '26', '27', '28', '29', '36', '37',
                         '38', '39', '40', '41', '42', '43', '44', '45', '55']
        self.page_num = 1    # first listing page to fetch
        self.total_num = 5   # number of listing pages per category

    @every(minutes=24 * 60)
    def on_start(self):
        """Queue every listing page of every category (runs once a day)."""
        global Cater_Name
        Cater_Name = []
        while self.page_num <= self.total_num:
            for self.CaterId in self.CaterIds:
                # Table lookup instead of a long if chain; an unknown id
                # leaves Cater_Name unchanged, exactly as the chain did.
                Cater_Name = self.CATEGORY_NAMES.get(self.CaterId, Cater_Name)
                print(self.CaterId)
                # BUG FIX: the original omitted the '/' between the category
                # id and 'page/', producing URLs like '/sort/26page/1/'.
                url = (self.base_url1 + str(self.CaterId) + '/' +
                       self.base_url2 + str(self.page_num) + '/')
                self.crawl(url, callback=self.index_page, save=Cater_Name)
            self.page_num += 1

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow each novel link on a category listing page."""
        for each in response.doc('#plist dd > div > a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.domain_page)

    def domain_page(self, response):
        """Follow the download-page link(s) on a novel's detail page."""
        for each in response.doc('div[class="filecont"] a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract the attachment URL, save the archive, and report it."""
        file_name = response.doc('h2').text()
        for each in response.doc('.panel-body a[href^="http"]').items():
            Down = each.attr.href
            if self.download(P_dir, file_name, Down):
                print('attachment url is ' + Down)
            return {
                'url': response.url,
                'title': response.doc('title').text(),
                'Down': Down,
                'file_name': response.doc('h2').text(),
            }

    # 文件下载
    def download(self, P_dir, file_name, Down):
        """Fetch the attachment at *Down* and write it to P_dir/file_name.rar.

        Returns True on success so the caller can log the URL.
        BUG FIX: the original wrote the URL string itself into the .rar
        file instead of the downloaded content, left the file handle
        unprotected against errors, and returned None, so the caller's
        log line never fired.
        """
        path = P_dir + '/' + file_name + '.rar'
        data = urllib2.urlopen(Down).read()
        with open(path, 'wb') as f:
            f.write(data)
        return True

原文 https://www.sxsay.com/872.html

正文完
 0
水东柳
版权声明:本站原创文章,由 水东柳 2019-04-28发表,共计2784字。
转载说明:除特殊说明外本站文章皆由CC-4.0协议发布,转载请注明出处。
评论(没有评论)