Json structure

General structure

Let’s look at a JSON example:

{
    "id": 1446997850,
    "crawlerType": 4,
    "maxIterations": "1",
    "info": {
        "description": "Site bbc.com",
        "templates": {
            "names": [
                "bbc.com"
            ]
        }
    },
    "items": [
        {
            "siteId": "0",
            "urlContentResponse": null,
            "depth": "1",
            "siteObj": {
                "fetchType": 1,
                "id": "0",
                "uDate": null,
                "tcDate": null,
                "cDate": null,
                "resources": null,
                "iterations": null,
                "description": null,
                "urls": [
                ],
                "filters": [
                    {
                        "pattern": "http(.*)",
                        "siteId": "0",
                        "type": 1
                    },
                    {
                        "mode": 0,
                        "pattern": "%MAX_DEPTH%",
                        "subject": "1",
                        "siteId": "0",
                        "type": 1,
                        "opCode": 4,
                        "stage": 0,
                        "action": 1
                    }
                ],
                "properties": {
                },
                "state": null,
                "priority": null,
                "maxURLs": null,
                "maxResources": null,
                "maxErrors": null,
                "maxResourceSize": null,
                "requestDelay": null,
                "httpTimeout": null,
                "errorMask": null,
                "errors": null,
                "urlType": null,
                "contents": null,
                "processingDelay": null,
                "size": null,
                "avgSpeed": null,
                "avgSpeedCounter": null,
                "userId": null,
                "recrawlPeriod": null,
                "recrawlDate": null,
                "maxURLsFromPage": null,
                "collectedURLs": null
            },
            "urlObj": {
                "status": 2,
                "linksI": 0,
                "linksE": 0,
                "contentMask": 0,
                "processingTime": 0,
                "CDate": null,
                "mRateCounter": 0,
                "httpTimeout": 10000,
                "size": 0,
                "urlPut": null,
                "batchId": 0,
                "lastModified": null,
                "tagsCount": 0,
                "mRate": 0,
                "charset": "",
                "state": 0,
                "httpCode": 0,
                "priority": 0,
                "maxURLsFromPage": null,
                "processingDelay": 0,
                "crawlingTime": 0,
                "type": 1,
                "processed": 0,
                "totalTime": 0,
                "siteSelect": 0,
                "contentType": "",
                "pDate": null,
                "errorMask": 0,
                "httpMethod": "get",
                "eTag": "",
                "siteId": "0",
                "freq": 0,
                "tcDate": null,
                "rawContentMd5": "",
                "crawled": 0,
                "UDate": null,
                "contentURLMd5": "",
                "requestDelay": 0,
                "depth": 0,
                "parentMd5": "",
                "urlUpdate": null,
                "tagsMask": 0,
                "urlMd5": "4e19678f4a2ca00057f029047b61e74c",
                "url": "http:\/\/www.bbc.com\/news\/world-middle-east-34107395"
            },
            "urlPutObj": {
                "putDict": {
                },
                "urlMd5": "4e19678f4a2ca00057f029047b61e74c",
                "contentType": 0,
                "siteId": "0",
                "fileStorageSuffix": null,
                "criterions": null
            },
            "properties": {
                "DB_TASK_MODE": "RO",
                "HTTP_REDIRECTS_MAX": 5,
                "HTML_REDIRECTS_MAX": 5,
                "HTML_RECOVER": "0",
                "ROBOTS_MODE": "0",
                "PROCESSOR_PROPERTIES": "{\"algorithm\":{\"algorithm_name\":\"user_name_algorithm\"},\"modules\":{\"user_name_algorithm\":[\"GooseExtractor\",\"NewspaperExtractor\",\"ScrapyExtractor\"]},\"SCRAPER_DOWNLOAD_IMAGES\":1,\"SCRAPER_TEXT_MARKUP\":{\"DIV\":\"\\n\",\"P\":\"\\n\",\"H\":\"\\n\",\"TR\":\"\\n\"}}",
                "template": {
                    "templates": [
                        {
                            "output_format": {
                                "type": "news",
                                "name": "json",
                                "header": "[\n",
                                "items_header": "",
                                "item": "{\n\"pubdate\":\"%pubdate%\",\n\"title\":\"%title%\",\n\"description\":\"%description%\",\n\"media\":\"%media%\",\n\"author\":\"%author%\",\n\"dc_date\":\"%dc_date%\",\n\"link\":\"%link%\",\n\"keywords\":\"%keywords%\",\n\"content_encoded\":\"%content_encoded%\",\n\"html_lang\":\"%html_lang%\",\n\"pubdate_extractor\":\"%pubdate_extractor%\",\n\"title_extractor\":\"%title_extractor%\",\n\"description_extractor\":\"%description_extractor%\",\n\"media_extractor\":\"%media_extractor%\",\n\"author_extractor\":\"%author_extractor%\",\n\"dc_date_extractor\":\"%dc_date_extractor%\",\n\"link_extractor\":\"%link_extractor%\",\n\"keywords_extractor\":\"%keywords_extractor%\",\n\"content_encoded_extractor\":\"%content_encoded_extractor%\",\n\"html_lang_extractor\":\"%html_lang_extractor%\",\n\"crawler_time\":\"%crawler_time%\",\n\"scraper_time\":\"%scraper_time%\",\n\"errors_mask\":\"%errors_mask%\"\n}\n",
                                "items_footer": "",
                                "footer": "]\n"
                            },
                            "tags": {
                                "pubdate": [
                                ],
                                "title": [
                                ],
                                "description": [
                                ],
                                "media": [
                                ],
                                "author": [
                                ],
                                "dc_date": [
                                ],
                                "link": [
                                ],
                                "keywords": [
                                ],
                                "content_encoded": [
                                ],
                                "html_lang": [
                                ],
                                "pubdate_extractor": [
                                ],
                                "title_extractor": [
                                ],
                                "description_extractor": [
                                ],
                                "media_extractor": [
                                ],
                                "author_extractor": [
                                ],
                                "dc_date_extractor": [
                                ],
                                "link_extractor": [
                                ],
                                "keywords_extractor": [
                                ],
                                "content_encoded_extractor": [
                                ],
                                "html_lang_extractor": [
                                ],
                                "crawler_time": [
                                ],
                                "scraper_time": [
                                ],
                                "errors_mask": [
                                ]
                            },
                            "priority": 100,
                            "mandatory": 1,
                            "is_filled": 0
                        }
                    ],
                    "select": "first_nonempty"
                }
            },
            "urlId": "4e19678f4a2ca00057f029047b61e74c"
        }
    ]
}

Apart from its size, the JSON is quite simple, so don’t be put off by its length. If we abbreviate its structure, we get something like this:

{
    "id": 1,
    "crawlerType": 4,
    "maxIterations": "1",
    "info": {...},
    "items": [
        {   "siteId": "0",
            "urlContentResponse": null,
            "depth": "1",
            "siteObj": {...},
            "urlObj": {...},
            "urlPutObj": {...},
            "properties": {
                "listofproperties": "list",
                "template": {
                    "templates": [
                        {
                            "output_format": {...},
                            "tags": {},
                            "priority": 100,
                            "mandatory": 1,
                            "is_filled": 0
                        }
                    ],
                    "select": "first_nonempty"
                }
            },
            "urlId": "b7632cc979f402538f516e48379f9101"
        }
    ]
}

In this form it is easier to understand, isn’t it? As you can see from the example, the biggest part of the JSON is the items array. It contains nested structures holding all the settings for the URL chosen for scraping. The main difference between the three scraping types – news, RSS and template-based – lies in items.properties. It defines the scraper mechanism and the output response format. One more detail: if you want to send several URLs in one request, you can – just add a separate entry to the items array, with all its settings, for every URL (and use the POST method, because GET doesn’t support such large requests).
Before we start: if you don’t need to understand how these JSONs are formed, you can always enter your settings through TagsReaper (Demo form), execute the request, and copy or download the ready JSON from the ‘API request json’ tab or the Settings/Export settings button, respectively.

General parameters

id Request id. This is just an identifier; you can change it to any value you want, or leave it as in the example;
info.description you may add here your own description;
info.templates.names is actually used for the scraping template, but you can also enter any descriptive name here to identify your template;
items.siteObj.fetchType 1 – static (default), 2 – dynamic, 3 – external;
items.siteObj.filters.pattern a regular expression that filters the requested URLs. If you leave it as in the example – http(.*) – the filter will match any URL with the http or https protocol. If you want to limit results to the source site’s URLs, build a regex like this:

^(?:http(?:s)?:\/\/)?(?:[^\.]+\.)?site\.com(.*)

urlObj.items.siteObj.httpTimeout timeout for the operation. If you get an empty response, try increasing this parameter;
items.urlObj.type Defines the type of scraping – news, template-based or RSS. However, changing only this field is not enough to change the type. Here we consider the news format, so this field must be as in the example;
items.urlObj.httpMethod method for request URL
items.urlObj.urlMd5 Basically, you can substitute any value, but to avoid identifier collisions in the DB we calculate it as the md5 of the URL; it must be the same everywhere it appears in the JSON;
url The URL for which you execute the API request; you can change it here. Note that you must first escape the characters as in the example;
items.properties.HTTP_REDIRECTS_MAX and properties.HTML_REDIRECTS_MAX The maximum number of HTTP and HTML redirects, respectively; you can configure them as you need;
items.properties.ROBOTS_MODE Whether to accept or ignore the rules set in robots.txt for the site whose URL was chosen for the request. ‘0’ – ignore the rules, ‘1’ – accept them;

Request json structure for URL, news scraping type

So, let’s look at the structure and contents of the request JSON for a URL with the news scraping type. Here is a JSON example:

{
    "id": 1446997850,
    "crawlerType": 4,
    "maxIterations": "1",
    "info": {
        "description": "Site bbc.com",
        "templates": {
            "names": [
                "bbc.com"
            ]
        }
    },
    "items": [
        {
            "siteId": "0",
            "urlContentResponse": null,
            "depth": "1",
            "siteObj": {
                "fetchType": 1,
                "id": "0",
                "uDate": null,
                "tcDate": null,
                "cDate": null,
                "resources": null,
                "iterations": null,
                "description": null,
                "urls": [
                ],
                "filters": [
                    {
                        "pattern": "http(.*)",
                        "siteId": "0",
                        "type": 1
                    },
                    {
                        "mode": 0,
                        "pattern": "%MAX_DEPTH%",
                        "subject": "1",
                        "siteId": "0",
                        "type": 1,
                        "opCode": 4,
                        "stage": 0,
                        "action": 1
                    }
                ],
                "properties": {
                },
                "state": null,
                "priority": null,
                "maxURLs": null,
                "maxResources": null,
                "maxErrors": null,
                "maxResourceSize": null,
                "requestDelay": null,
                "httpTimeout": null,
                "errorMask": null,
                "errors": null,
                "urlType": null,
                "contents": null,
                "processingDelay": null,
                "size": null,
                "avgSpeed": null,
                "avgSpeedCounter": null,
                "userId": null,
                "recrawlPeriod": null,
                "recrawlDate": null,
                "maxURLsFromPage": null,
                "collectedURLs": null
            },
            "urlObj": {
                "status": 2,
                "linksI": 0,
                "linksE": 0,
                "contentMask": 0,
                "processingTime": 0,
                "CDate": null,
                "mRateCounter": 0,
                "httpTimeout": 10000,
                "size": 0,
                "urlPut": null,
                "batchId": 0,
                "lastModified": null,
                "tagsCount": 0,
                "mRate": 0,
                "charset": "",
                "state": 0,
                "httpCode": 0,
                "priority": 0,
                "maxURLsFromPage": null,
                "processingDelay": 0,
                "crawlingTime": 0,
                "type": 1,
                "processed": 0,
                "totalTime": 0,
                "siteSelect": 0,
                "contentType": "",
                "pDate": null,
                "errorMask": 0,
                "httpMethod": "get",
                "eTag": "",
                "siteId": "0",
                "freq": 0,
                "tcDate": null,
                "rawContentMd5": "",
                "crawled": 0,
                "UDate": null,
                "contentURLMd5": "",
                "requestDelay": 0,
                "depth": 0,
                "parentMd5": "",
                "urlUpdate": null,
                "tagsMask": 0,
                "urlMd5": "4e19678f4a2ca00057f029047b61e74c",
                "url": "http:\/\/www.bbc.com\/news\/world-middle-east-34107395"
            },
            "urlPutObj": {
                "putDict": {
                },
                "urlMd5": "4e19678f4a2ca00057f029047b61e74c",
                "contentType": 0,
                "siteId": "0",
                "fileStorageSuffix": null,
                "criterions": null
            },
            "properties": {
                "DB_TASK_MODE": "RO",
                "HTTP_REDIRECTS_MAX": 5,
                "HTML_REDIRECTS_MAX": 5,
                "HTML_RECOVER": "0",
                "ROBOTS_MODE": "0",
                "PROCESSOR_PROPERTIES": "{\"algorithm\":{\"algorithm_name\":\"user_name_algorithm\"},\"modules\":{\"user_name_algorithm\":[\"GooseExtractor\",\"NewspaperExtractor\",\"ScrapyExtractor\"]},\"SCRAPER_DOWNLOAD_IMAGES\":1,\"SCRAPER_TEXT_MARKUP\":{\"DIV\":\"\\n\",\"P\":\"\\n\",\"H\":\"\\n\",\"TR\":\"\\n\"}}",
                "template": {
                    "templates": [
                        {
                            "output_format": {
                                "type": "news",
                                "name": "json",
                                "header": "[\n",
                                "items_header": "",
                                "item": "{\n\"pubdate\":\"%pubdate%\",\n\"title\":\"%title%\",\n\"description\":\"%description%\",\n\"media\":\"%media%\",\n\"author\":\"%author%\",\n\"dc_date\":\"%dc_date%\",\n\"link\":\"%link%\",\n\"keywords\":\"%keywords%\",\n\"content_encoded\":\"%content_encoded%\",\n\"html_lang\":\"%html_lang%\",\n\"pubdate_extractor\":\"%pubdate_extractor%\",\n\"title_extractor\":\"%title_extractor%\",\n\"description_extractor\":\"%description_extractor%\",\n\"media_extractor\":\"%media_extractor%\",\n\"author_extractor\":\"%author_extractor%\",\n\"dc_date_extractor\":\"%dc_date_extractor%\",\n\"link_extractor\":\"%link_extractor%\",\n\"keywords_extractor\":\"%keywords_extractor%\",\n\"content_encoded_extractor\":\"%content_encoded_extractor%\",\n\"html_lang_extractor\":\"%html_lang_extractor%\",\n\"crawler_time\":\"%crawler_time%\",\n\"scraper_time\":\"%scraper_time%\",\n\"errors_mask\":\"%errors_mask%\"\n}\n",
                                "items_footer": "",
                                "footer": "]\n"
                            },
                            "tags": {
                                "pubdate": [
                                ],
                                "title": [
                                ],
                                "description": [
                                ],
                                "media": [
                                ],
                                "author": [
                                ],
                                "dc_date": [
                                ],
                                "link": [
                                ],
                                "keywords": [
                                ],
                                "content_encoded": [
                                ],
                                "html_lang": [
                                ],
                                "pubdate_extractor": [
                                ],
                                "title_extractor": [
                                ],
                                "description_extractor": [
                                ],
                                "media_extractor": [
                                ],
                                "author_extractor": [
                                ],
                                "dc_date_extractor": [
                                ],
                                "link_extractor": [
                                ],
                                "keywords_extractor": [
                                ],
                                "content_encoded_extractor": [
                                ],
                                "html_lang_extractor": [
                                ],
                                "crawler_time": [
                                ],
                                "scraper_time": [
                                ],
                                "errors_mask": [
                                ]
                            },
                            "priority": 100,
                            "mandatory": 1,
                            "is_filled": 0
                        }
                    ],
                    "select": "first_nonempty"
                }
            },
            "urlId": "4e19678f4a2ca00057f029047b61e74c"
        }
    ]
}

You don’t need to know how all the fields work, just the defining ones. The fields that we do not consider here must be sent without changes, as in the example;
The general parameters were reviewed in detail above; the following parameters are specific to the news format:
items.properties.PROCESSOR_PROPERTIES this setting defines which built-in scrapers you want to use and in what order: GooseExtractor, NewspaperExtractor and ScrapyExtractor. By default, as you see in the example, we use all of them, in the order that we believe is optimal. If you want to change the order, you will get something like this:

"{
    \"algorithm\": {
        \"algorithm_name\": \"user_name_algorithm\"
    },
    \"modules\": {
        \"user_name_algorithm\": [
            \"NewspaperExtractor\",
            \"GooseExtractor\",
            \"ScrapyExtractor\"
        ]
    },
    \"SCRAPER_DOWNLOAD_IMAGES\": 1,
    \"SCRAPER_TEXT_MARKUP\": {
        \"DIV\": \"\\n\",
        \"P\": \"\\n\",
        \"H\": \"\\n\",
        \"TR\": \"\\n\"
    }
}"

If you don’t want to use some of them:

"{
    \"algorithm\": {
        \"algorithm_name\": \"user_name_algorithm\"
    },
    \"modules\": {
        \"user_name_algorithm\": [
            \"GooseExtractor\",
            \"ScrapyExtractor\"
        ]
    },
    \"SCRAPER_DOWNLOAD_IMAGES\": 1,
    \"SCRAPER_TEXT_MARKUP\": {
        \"DIV\": \"\\n\",
        \"P\": \"\\n\",
        \"H\": \"\\n\",
        \"TR\": \"\\n\"
    }
}"

items.properties.template.templates.output_format Settings of the response format. You can use ‘json’, ‘html’, ‘csv’, ‘text’, ‘xml’ and ‘sql’. Here are examples of the settings for each format:

json
 "output_format": {
                                "type": "news",
                                "name": "json",
                                "header": "[\n",
                                "items_header": "",
                                "item": "{\n\"pubdate\":\"%pubdate%\",\n\"title\":\"%title%\",\n\"description\":\"%description%\",\n\"media\":\"%media%\",\n\"author\":\"%author%\",\n\"dc_date\":\"%dc_date%\",\n\"link\":\"%link%\",\n\"keywords\":\"%keywords%\",\n\"content_encoded\":\"%content_encoded%\",\n\"html_lang\":\"%html_lang%\",\n\"pubdate_extractor\":\"%pubdate_extractor%\",\n\"title_extractor\":\"%title_extractor%\",\n\"description_extractor\":\"%description_extractor%\",\n\"media_extractor\":\"%media_extractor%\",\n\"author_extractor\":\"%author_extractor%\",\n\"dc_date_extractor\":\"%dc_date_extractor%\",\n\"link_extractor\":\"%link_extractor%\",\n\"keywords_extractor\":\"%keywords_extractor%\",\n\"content_encoded_extractor\":\"%content_encoded_extractor%\",\n\"html_lang_extractor\":\"%html_lang_extractor%\",\n\"crawler_time\":\"%crawler_time%\",\n\"scraper_time\":\"%scraper_time%\",\n\"errors_mask\":\"%errors_mask%\"\n}\n",
                                "items_footer": "",
                                "footer": "]\n"
                            },
html
 "output_format": { 
                        "type": "news", 
                        "name": "html", 
                        "header": "<!DOCTYPE html><head><title>Title<\/title><meta http-equiv=\"content-type\" content=\"text\/html; charset=UTF-8\"><\/head><body>\n", 
                        "items_header": " <table>\n", 
                        "item": "<tr><td>pubdate:<\/td><td>%pubdate%<\/td><\/tr><tr><td>title:<\/td><td>%title%<\/td><\/tr><tr><td>description:<\/td><td>%description%<\/td><\/tr><tr><td>media:<\/td><td>%media%<\/td><\/tr><tr><td>author:<\/td><td>%author%<\/td><\/tr><tr><td>dc_date:<\/td><td>%dc_date%<\/td><\/tr><tr><td>link:<\/td><td>%link%<\/td><\/tr><tr><td>keywords:<\/td><td>%keywords%<\/td><\/tr><tr><td>content_encoded:<\/td><td>%content_encoded%<\/td><\/tr><tr><td>html_lang:<\/td><td>%html_lang%<\/td><\/tr><tr><td>pubdate_extractor:<\/td><td>%pubdate_extractor%<\/td><\/tr><tr><td>title_extractor:<\/td><td>%title_extractor%<\/td><\/tr><tr><td>description_extractor:<\/td><td>%description_extractor%<\/td><\/tr><tr><td>media_extractor:<\/td><td>%media_extractor%<\/td><\/tr><tr><td>author_extractor:<\/td><td>%author_extractor%<\/td><\/tr><tr><td>dc_date_extractor:<\/td><td>%dc_date_extractor%<\/td><\/tr><tr><td>link_extractor:<\/td><td>%link_extractor%<\/td><\/tr><tr><td>keywords_extractor:<\/td><td>%keywords_extractor%<\/td><\/tr><tr><td>content_encoded_extractor:<\/td><td>%content_encoded_extractor%<\/td><\/tr><tr><td>html_lang_extractor:<\/td><td>%html_lang_extractor%<\/td><\/tr><tr><td>crawler_time:<\/td><td>%crawler_time%<\/td><\/tr><tr><td>scraper_time:<\/td><td>%scraper_time%<\/td><\/tr><tr><td>errors_mask:<\/td><td>%errors_mask%<\/td><\/tr>\n", 
                        "items_footer": " <\/table>\n", 
                         "footer": "<\/body><\/html>\n" 
                                                                   }
csv
 "output_format": {
                                "type": "news",
                                "name": "csv",
                                "header": "\"pubdate\",\"title\",\"description\",\"media\",\"author\",\"dc_date\",\"link\",\"keywords\",\"content_encoded\",\"html_lang\",\"pubdate_extractor\",\"title_extractor\",\"description_extractor\",\"media_extractor\",\"author_extractor\",\"dc_date_extractor\",\"link_extractor\",\"keywords_extractor\",\"content_encoded_extractor\",\"html_lang_extractor\",\"crawler_time\",\"scraper_time\",\"errors_mask\"\n",
                                "items_header": "",
                                "item": "\"%pubdate%\",\"%title%\",\"%description%\",\"%media%\",\"%author%\",\"%dc_date%\",\"%link%\",\"%keywords%\",\"%content_encoded%\"\n,\"%html_lang%\",\"%pubdate_extractor%\",\"%title_extractor%\",\"%description_extractor%\",\"%media_extractor%\",\"%author_extractor%\",\"%dc_date_extractor%\",\"%link_extractor%\",\"%keywords_extractor%\",\"%content_encoded_extractor%\"\n,\"%html_lang_extractor%\",\"%crawler_time%\",\"%scraper_time%\",\"%errors_mask%\"\n",
                                "items_footer": "",
                                "footer": ""
									}
text
 "output_format": {
                                "type": "news",
                                "name": "text",
                                "header": "",
                                "items_header": "",
                                "item": "pubdate: %pubdate%\ntitle: %title%\ndescription: %description%\nmedia: %media%\nauthor: %author%\ndc_date: %dc_date%\nlink: %link%\nkeywords: %keywords%\ncontent_encoded: %content_encoded%\nhtml_lang: %html_lang%\npubdate_extractor: %pubdate_extractor%\ntitle_extractor: %title_extractor%\ndescription_extractor: %description_extractor%\nmedia_extractor: %media_extractor%\nauthor_extractor: %author_extractor%\ndc_date_extractor: %dc_date_extractor%\nlink_extractor: %link_extractor%\nkeywords_extractor: %keywords_extractor%\ncontent_encoded_extractor: %content_encoded_extractor%\nhtml_lang_extractor: %html_lang_extractor%\ncrawler_time: %crawler_time%\nscraper_time: %scraper_time%\nerrors_mask: %errors_mask%\n",
                                "items_footer": "",
                                "footer": ""
								}
SQL
"output_format": {
                                "type": "news",
                                "name": "sql",
                                "header": "INSERT INTO my_table (pubdate,title,description,media,author,dc_date,link,keywords,content_encoded,html_lang,pubdate_extractor,title_extractor,description_extractor,media_extractor,author_extractor,dc_date_extractor,link_extractor,keywords_extractor,content_encoded_extractor,html_lang_extractor,crawler_time,scraper_time,errors_mask) VALUES \n",
                                "items_header": "",
                                "item": "(\"%pubdate%\",\"%title%\",\"%description%\",\"%media%\",\"%author%\",\"%dc_date%\",\"%link%\",\"%keywords%\",\"%content_encoded%\",\"%html_lang%\",\"%pubdate_extractor%\",\"%title_extractor%\",\"%description_extractor%\",\"%media_extractor%\",\"%author_extractor%\",\"%dc_date_extractor%\",\"%link_extractor%\",\"%keywords_extractor%\",\"%content_encoded_extractor%\",\"%html_lang_extractor%\",\"%crawler_time%\",\"%scraper_time%\",\"%errors_mask%\")",
                                "items_footer": "",
                                "footer": ";\n"
                            }
xml
"output_format": {
                                "type": "news",
                                "name": "xml",
                                "header": "<?xml version=\"1.0\"?>\n<response>\n",
                                "items_header": "  <item>\n",
                                "item": "    <pubdate>%pubdate%<\/pubdate>\n    <title><![CDATA[%title%]]><\/title>\n    <description><![CDATA[%description%]]><\/description>\n    <media><![CDATA[%media%]]><\/media>\n    <author><![CDATA[%author%]]><\/author>\n    <dc_date><![CDATA[%dc_date%]]><\/dc_date>\n    <link><![CDATA[%link%]]><\/link>\n    <keywords><![CDATA[%keywords%]]><\/keywords>\n    <content_encoded><![CDATA[%content_encoded%]]><\/content_encoded>\n    <html_lang><![CDATA[%html_lang%]]><\/html_lang>\n    <pubdate_extractor>%pubdate_extractor%<\/pubdate_extractor>\n    <title_extractor>%title_extractor%<\/title_extractor>\n    <description_extractor>%description_extractor%<\/description_extractor>\n    <media_extractor>%media_extractor%<\/media_extractor>\n    <author_extractor>%author_extractor%<\/author_extractor>\n    <dc_date_extractor>%dc_date_extractor%<\/dc_date_extractor>\n    <link_extractor>%link_extractor%<\/link_extractor>\n    <keywords_extractor>%keywords_extractor%<\/keywords_extractor>\n    <content_encoded_extractor>%content_encoded_extractor%<\/content_encoded_extractor>\n    <html_lang_extractor>%html_lang_extractor%<\/html_lang_extractor>\n    <crawler_time>%crawler_time%<\/crawler_time>\n    <scraper_time>%scraper_time%<\/scraper_time>\n    <errors_mask>%errors_mask%<\/errors_mask>\n",
                                "items_footer": "  <\/item>\n",
                                "footer": "<\/response>"
                            }

items.properties.template.templates.tags Set of basic tags: you can remove those you are not interested in, but you must then also remove the corresponding tags from output_format. Be careful while editing your JSON, and validate it after all edits;

There is one more parameter that we have not reviewed yet: depth. If you want to set the crawl depth and the maximum number of URLs taken from each level, changing depth alone is not enough – you have to change several parameters:

maxIterations and items.depth these parameters define the depth for the crawler and must have the same value – for example, for depth 2 both should be ‘2’, and so on;
items.siteObj.filters.subject this is the depth for the filter; for deeper crawling it must have the same value as maxIterations and items.depth;
items.siteObj.maxURLsFromPage and items.urlObj.maxURLsFromPage must have the same value – this is how many links from every level you want to process.

© 2015-2016 TagsReaper. All rights reserved.