crawler-url-parser

2.0.5 • Public • Published

crawler-url-parser

A URL parser for crawling purposes

version downloads node status

Installation

npm install crawler-url-parser

Usage

Parse

const cup = require('crawler-url-parser');
 
//// parse(current_url[,base_url])
let result = cup.parse("http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");
 
console.log(result.url);
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2
 
console.log(result.baseurl);
// null
 
console.log(result.normalized);
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2
 
console.log(result.host); 
// question.stackoverflow.com
 
console.log(result.domain); 
// stackoverflow.com
 
console.log(result.subdomain); 
// question
 
console.log(result.protocol); 
// http:
 
console.log(result.path); 
// /aaa/bbb/ddd
 
console.log(result.search); 
// q1=query1&q2=query2
 
console.log(result.querycount); 
// 2

Parse with baseURL

const cup = require('crawler-url-parser');
 
//// parse(current_url[,base_url])
let result = cup.parse("../ddd?q1=query1&q2=query2","http://question.stackoverflow.com/aaa/bbb/ccc/");
 
console.log(result.url);
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2
 
console.log(result.baseurl);
// http://question.stackoverflow.com/aaa/bbb/ccc
 
console.log(result.normalized);
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2
 
console.log(result.host); 
// question.stackoverflow.com
 
console.log(result.domain); 
// stackoverflow.com
 
console.log(result.subdomain); 
// question
 
console.log(result.protocol); 
// http:
 
console.log(result.path); 
// /aaa/bbb/ddd
 
console.log(result.search); 
// q1=query1&q2=query2
 
console.log(result.querycount); 
// 2

Extract

const cup = require('crawler-url-parser');
 
//// extract(html_str,current_url);
let htmlStr='<html><body> \
    <a href="http://best.question.stackoverflow.com">subdomain</a><br /> \
    <a href="http://faq.stackoverflow.com">subdomain</a><br /> \
    <a href="http://stackoverflow.com">updomain</a><br /> \
    <a href="http://www.google.com">external</a><br /> \
    <a href="http://www.facebook.com">external</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/bbb/ccc">sublevel</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/bbb/zzz">sublevel</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/">uplevel</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/ddd">samelevel</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/eee">samelevel</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/ddd/eee">internal</a><br /> \
    <a href="http://question.stackoverflow.com/zzz">internal</a><br /> \
</body></html>';
 
let currentUrl= "http://question.stackoverflow.com/aaa/bbb";
let urls = cup.extract(htmlStr,currentUrl);
 
console.log(urls[0].type); //subdomain
console.log(urls[1].type); //subdomain
console.log(urls[2].type); //updomain
console.log(urls[3].type); //external
console.log(urls[4].type); //external
console.log(urls[5].type); //sublevel
console.log(urls[6].type); //sublevel
console.log(urls[7].type); //uplevel
console.log(urls[8].type); //samelevel
console.log(urls[9].type); //samelevel
console.log(urls[10].type); //internal
console.log(urls[11].type); //internal
 

Level

const cup = require('crawler-url-parser');
 
//// gettype(current_url,base_url);
let level = cup.gettype("sub.domain.com/aaa/bbb/","sub.domain.com/aaa/bbb/ccc");
console.log(level); //sublevel
 
level = cup.gettype("sub.domain.com/aaa/bbb/ccc/ddd","sub.domain.com/aaa/bbb/ccc");
console.log(level); //uplevel
 
level = cup.gettype("sub.domain.com/aaa/bbb/eee","sub.domain.com/aaa/bbb/ccc");
console.log(level); //samelevel
 
level = cup.gettype("sub.domain.com/aaa/bbb/eee","sub.anotherdomain.com/aaa/bbb/ccc");
console.log(level); //external

Test

Support

I use this package actively myself, so it has my top priority. You can chat on WhatsApp about any questions, ideas, and suggestions.

WhatsApp

Submitting an Issue

If you find a bug or a mistake, you can help by submitting an issue to the GitLab repository.

Creating a Merge Request

GitLab calls it merge request instead of pull request.

License

MIT licensed, and all its dependencies are MIT or BSD licensed.

Package Sidebar

Install

npm i crawler-url-parser

Weekly Downloads

40

Version

2.0.5

License

MIT

Last publish

Collaborators

  • mehmet.kozan