
Python + pyspider crawler for Zxcs (知轩藏书), downloading books to local disk


Here is the source code for a 知轩藏书 (Zxcs) crawler, once again built with Python + pyspider. The pyspider framework ships with a web UI for managing spiders, which makes it easy to run: just paste the code into a project and start it.
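For reference: pyspider can typically be installed with pip install pyspider; running the pyspider command then starts all of its components, and by default the WebUI is served at http://localhost:5000, where you create a new project and paste the script below into the editor.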

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-04-28 1:24:37
# Project: Zxcs
from pyspider.libs.base_handler import *
import os
import urllib2
class Handler(BaseHandler):
    # the save path is kept as a module-level global so detail_page/download can use it
    global P_dir
    # local directory where the downloaded novels are stored
    P_dir = '/Home/Book'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    # apply the headers and a 300s timeout to every request
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }
    def __init__(self):
        # base URLs of the category listing pages; adjust if the site address changes
        self.base_url1 = 'http://www.zxcs.me/sort/'
        # leading slash so the final URL becomes /sort/<id>/page/<n>/
        self.base_url2 = '/page/'
        # category ids used by the site
        self.CaterIds = ['23', '25', '26', '27', '28', '29', '36', '37', '38', '39',
                         '40', '41', '42', '43', '44', '45', '55']
        # crawl listing pages 1 .. total_num for every category
        self.page_num = 1
        self.total_num = 5
    @every(minutes=24 * 60)
    def on_start(self):
        # map each category id to a readable name (handy later when importing into a database)
        cater_names = {
            '26': '玄幻', '38': '玄幻', '39': '玄幻',
            '25': '武侠', '36': '武侠', '37': '武侠',
            '28': '历史', '42': '历史', '43': '历史',
            '23': '都市', '55': '都市',
            '27': '科幻', '40': '科幻', '41': '科幻',
            '29': '游戏', '44': '游戏', '45': '游戏'
        }
        while self.page_num <= self.total_num:
            for cater_id in self.CaterIds:
                cater_name = cater_names[cater_id]
                print cater_id
                # e.g. http://www.zxcs.me/sort/23/page/1/
                url = self.base_url1 + cater_id + self.base_url2 + str(self.page_num) + "/"
                self.crawl(url, callback=self.index_page, save=cater_name)
            self.page_num += 1
    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # listing page: queue every book linked under #plist
        for each in response.doc('#plist dd > div > a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.domain_page)

    def domain_page(self, response):
        # book page: queue the download page linked in the filecont block
        for each in response.doc('div[class="filecont"] a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)
    @config(priority=2)
    def detail_page(self, response):
        file_name = response.doc('h2').text()
        # the panel body lists the mirror links; save the first one and return
        for each in response.doc('.panel-body a[href^="http"]').items():
            Down = each.attr.href
            if self.download(P_dir, file_name, Down):
                print('attachment url is ' + Down)
            return {
                "url": response.url,
                "title": response.doc('title').text(),
                "Down": Down,
                "file_name": file_name
            }
    # fetch the attachment and save it under P_dir as <file_name>.rar
    def download(self, P_dir, file_name, Down):
        if not os.path.exists(P_dir):
            os.makedirs(P_dir)
        file_path = P_dir + "/" + file_name + ".rar"
        print file_path
        f = open(file_path, "wb")
        f.write(urllib2.urlopen(Down, timeout=300).read())
        f.close()
        return True
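
The save logic can be checked outside of pyspider. Below is a minimal standalone sketch (Python 2) using a hypothetical attachment URL and the same save directory as P_dir above; it simply mirrors what Handler.download() does for one link.

import os
import urllib2

save_dir = '/Home/Book'                  # same save path as P_dir in the script
url = 'http://example.com/novel.rar'     # hypothetical attachment URL, replace with a real one
file_name = 'novel'

if not os.path.exists(save_dir):
    os.makedirs(save_dir)
data = urllib2.urlopen(url, timeout=60).read()
with open(save_dir + '/' + file_name + '.rar', 'wb') as f:
    f.write(data)
print 'saved %d bytes to %s' % (len(data), save_dir)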

Original post: https://www.sxsay.com/872.html
