flowesh

1.2.4 • Public • Published

Flowesh

Flowesh is the non-cluster version of floodesh. It is a lightweight, middleware-based web spider that is easy to maintain.

Install

npm install flowesh

Usage

/**
 * Example spider. An instance is handed to `Flowesh#attach`; the hooks and
 * parser callbacks live on the prototype below.
 */
function Spider(){
    this.name = 'MySpider';
}

Spider.prototype = {
    // Initial tasks: `opt` holds the request options, `next` names the
    // prototype method that handles the response.
    seed:[{
        opt:{
            uri:'http://www.baidu.com'
        },
        next:'parse'
    }],
    /**
     * Init-stage hook.
     * @param {Function} done - must be called once initialisation is finished
     */
    onInit:function(done){
        // do whatever you want on the init stage
        // NOTE(review): these entries put uri/priority at the top level, unlike
        // the {opt, next} shape used by the initial seed above - confirm which
        // shape the scheduler expects.
        this.seed.push({
            uri:'http://www.hao123.com',
            priority:3
        });
        this.seed.push({
            uri:'http://www.qq.com',
            priority:1
        });
        done();
    },
    // onData is optional, happens after the response goes through all the middlewares
    onData:function(dataSet){
        console.log(dataSet.get('data'));
    },
    // onComplete is optional, happens after onData
    onComplete:function(ctx){
        console.log('%s complete', ctx.request.url);
    },
    /**
     * Response handler referenced by `next:'parse'`.
     * Logs the page title (when one is found) and queues a follow-up task.
     * @param {Object} ctx - reads `ctx.content` (response text) and appends to `ctx.tasks`
     * @param {Function} done - must be called when parsing is finished
     */
    parse:function(ctx, done){
        // The slash of the closing tag has to be escaped as `\/`; a bare `\t`
        // inside a regex matches a TAB character and would never match </title>.
        const title = ctx.content.match(/<title>(.*?)<\/title>/);
        // match() returns null when the page has no <title>; skip logging then.
        if (title) {
            console.log(title[1]);
        }
        // if you have new tasks generated
        ctx.tasks.push({
            opt:{
                uri:'http://www.163.com',
                priority:0
            },
            next:'parse'
        });
        done();
    }
};

const Flowesh = require('flowesh'),
    // response middleware that detects the charset of the response
    charsetparser = require('mof-charsetparser'),
    // response middleware that converts the response to utf-8 encoding
    iconv = require('mof-iconv'),
    // response middleware that loads response into a jQuery object which has the same usage as jQuery
    cheerio = require('mof-cheerio'),
    // request middleware that corrects your queue options, e.g. attribute 'jquery' in your queue option will be changed into 'jQuery'
    normalizer = require('mof-normalizer'),
    // request middleware that adapts your queue options to meet request(https://github.com/request/request) requirements
    reqadapter = require('mof-reqadapter');

// Settings object passed to the Flowesh constructor.
const config = {
    schedule: {
        concurrent: 1,
        rate: 5000,
        priorityRange: 10, // default 10
    },
    request: {
        retry: 3,
    },
};

// Create the crawler from the config and attach the spider to it.
const flowesh = new Flowesh(config).attach(new Spider());

// middlewares will be executed in order, so each list below is registered
// front to back; every factory is invoked right before its own use() call.
for (const createRequestMiddleware of [normalizer, reqadapter]) {
    flowesh.requestmw.use(createRequestMiddleware());
}

for (const createResponseMiddleware of [charsetparser, iconv, cheerio]) {
    flowesh.responsemw.use(createResponseMiddleware());
}

flowesh.start();

Package Sidebar

Install

npm i flowesh

Weekly Downloads

0

Version

1.2.4

License

ISC

Last publish

Collaborators

  • darrenqc