scr-api

Turn web pages into APIs.

npm install scr-api
12 downloads in the last week
24 downloads in the last month

scr-api

A schema-based HTML scraping library for Node.js

scr-api takes a JSON document called a schema and some HTML, and returns JSON objects based on the schema.

Example

A simple example

var scrapi = require("scr-api");

var html = "<div id='time-and-date'>" +
  "<p>Today is <span class='date'>14 May, 2013</span>.<br/> " +
  "The time is <span class='time'>11:53 AM</span></p>" +
  "</div>";

var schema = {
  "The current date": {
    "selector": ".date",
    "is":""
  },
  "The current time": {
    "selector": ".time",
    "is":""
  }
}

console.log(scrapi.scrape(html, schema));

Would return

{ 
    'The current date': '14 May, 2013',
    'The current time': '11:53 AM' 
}

Schema syntax

Schemas are of the form

{
    "I'm an object key name":                     // This value is the name of the 
                                                // property that will be scraped
    { 
        "selector": "a #valid .css .selector",     // A selector to filter HTML with,
                                                // these are compound

        "attr": "href",                         // The property of the tag to 
                                                // extract a value from, optional

        "is": "",                                 // either "", [], or {}     

        "type":    "int",                            // how to interpret the value of 
                                                // the text found by the selector
                                                 // Either:     string
                                                 //             int
                                                 //             float
                                                 //             bool
                                                 //             boolTextExists
                                                 //             boolHtmlExists


        "of": < anotherSchemaDefinition >        // Schema definitions are recursive,
                                                // they can be arbitrarily nested
    }
}

Scrapi schemas can be arbitrarily nested

{
    "I'm an object key name": {
        "selector": "a #valid .css",
        "is": [],
        "of": {
            "So am I": {
                "selector": ".selector",
                "is": {},
                "of": {
                    "I'm a turtle": {
                        "selector": "a #valid .css .selector",
                        "is": [],
                        "of": < turtlesAllTheWayDown >
                    }
                }
            }
        }
    }
}

Complex example

For a more complex example, let's scrape the Hacker News headlines:

var request = require('request'),
    scrapi = require("scr-api");

request({
    uri: "http://news.ycombinator.com",
}, function(err, res, body) {

    var schema = {
        "Headlines": {
            "selector": "table table .title a",
            "is": [],
            "of": {
                "Headline": {
                    "is": ""
                },
                "Link": {
                    "is": "",
                    "attr": "href"
                }
            }
        }
    };

    console.log(scrapi.scrape(body, schema));
});

Would then return

{
    Headlines: [{
        Headline: 'Go 1.1 is released',
        Link: 'http://blog.golang.org/2013/05/go-11-is-released.html'
    }, {
        Headline: 'U.S. Secretly Obtains Two Months of A.P. Phone Records',
        Link: 'http://www.nytimes.com/aponline/2013/05/13/us/politics/ap-us-ap-phone-records-subpoena.html?pagewanted=all'
    }, 
    ... 
    ... 
    ...]
}
npm loves you