Request JSON structure for a template

Example JSON for a URL, template-based:

{
    "id": 1446930914,
    "crawlerType": 4,
    "maxIterations": "1",
    "info": {
        "description": "Site huffingtonpost.de",
        "templates": {
            "names": [
                "huffingtonpost.de"
            ]
        }
    },
    "items": [
        {
            "siteId": "0",
            "urlContentResponse": null,
            "depth": "1",
            "siteObj": {
                "fetchType": 1,
                "id": "0",
                "uDate": null,
                "tcDate": null,
                "cDate": null,
                "resources": null,
                "iterations": null,
                "description": null,
                "urls": [
                ],
                "filters": [
                    {
                        "pattern": "http(.*)",
                        "siteId": "0",
                        "type": 1
                    },
                    {
                        "mode": 0,
                        "pattern": "%MAX_DEPTH%",
                        "subject": "1",
                        "siteId": "0",
                        "type": 1,
                        "opCode": 4,
                        "stage": 0,
                        "action": 1
                    }
                ],
                "properties": {
                },
                "state": null,
                "priority": null,
                "maxURLs": null,
                "maxResources": null,
                "maxErrors": null,
                "maxResourceSize": null,
                "requestDelay": null,
                "httpTimeout": null,
                "errorMask": null,
                "errors": null,
                "urlType": null,
                "contents": null,
                "processingDelay": null,
                "size": null,
                "avgSpeed": null,
                "avgSpeedCounter": null,
                "userId": null,
                "recrawlPeriod": null,
                "recrawlDate": null,
                "maxURLsFromPage": null,
                "collectedURLs": null
            },
            "urlObj": {
                "status": 2,
                "linksI": 0,
                "linksE": 0,
                "contentMask": 0,
                "processingTime": 0,
                "CDate": null,
                "mRateCounter": 0,
                "httpTimeout": 10000,
                "size": 0,
                "urlPut": null,
                "batchId": 0,
                "lastModified": null,
                "tagsCount": 0,
                "mRate": 0,
                "charset": "",
                "state": 0,
                "httpCode": 0,
                "priority": 0,
                "maxURLsFromPage": null,
                "processingDelay": 0,
                "crawlingTime": 0,
                "type": 1,
                "processed": 0,
                "totalTime": 0,
                "siteSelect": 0,
                "contentType": "",
                "pDate": null,
                "errorMask": 0,
                "httpMethod": "get",
                "eTag": "",
                "siteId": "0",
                "freq": 0,
                "tcDate": null,
                "rawContentMd5": "",
                "crawled": 0,
                "UDate": null,
                "contentURLMd5": "",
                "requestDelay": 0,
                "depth": 0,
                "parentMd5": "",
                "urlUpdate": null,
                "tagsMask": 0,
                "urlMd5": "fc727b7e98a47c0e803da0380ad88159",
                "url": "http:\/\/www.huffingtonpost.de\/2015\/09\/01\/islaender-wollen-privat-fluechtlinge-aufnehmen_n_8069648.html?utm_hp_ref=germany"
            },
            "urlPutObj": {
                "putDict": {
                },
                "urlMd5": "fc727b7e98a47c0e803da0380ad88159",
                "contentType": 0,
                "siteId": "0",
                "fileStorageSuffix": null,
                "criterions": null
            },
            "properties": {
                "DB_TASK_MODE": "RO",
                "HTTP_REDIRECTS_MAX": 5,
                "HTML_REDIRECTS_MAX": 5,
                "HTML_RECOVER": "0",
                "ROBOTS_MODE": "0",
                "URL_CHAIN": null,
                "template": {
                    "templates": [
                        {
                            "priority": "0",
                            "mandatory": 0,
                            "state": 1,
                            "is_filled": 0,
                            "output_format": {
                                "type": "template",
                                "name": "json",
                                "header": "[\n",
                                "items_header": "",
                                "item": "{\n\"keywords\":\"%keywords%\",\n\"description\":\"%description%\",\n\"body\":\"%body%\",\n\"link\":\"%link%\",\n\"image\":\"%image%\",\n\"author\":\"%author%\",\n\"pubdate\":\"%pubdate%\",\n\"title\":\"%title%\",\n\"keywords_extractor\":\"%keywords_extractor%\",\n\"description_extractor\":\"%description_extractor%\",\n\"body_extractor\":\"%body_extractor%\",\n\"link_extractor\":\"%link_extractor%\",\n\"image_extractor\":\"%image_extractor%\",\n\"author_extractor\":\"%author_extractor%\",\n\"pubdate_extractor\":\"%pubdate_extractor%\",\n\"title_extractor\":\"%title_extractor%\",\n\"crawler_time\":\"%crawler_time%\",\n\"scraper_time\":\"%scraper_time%\",\n\"errors_mask\":\"%errors_mask%\"\n}\n",
                                "items_footer": "",
                                "footer": "]\n"
                            },
                            "tags": {
                                "title": [
                                    {
                                        "default": "",
                                        "begin": "",
                                        "end": "",
                                        "target": "\/\/*[@class='title']",
                                        "postProcessing": "",
                                        "join": "concat",
                                        "delimiter": "",
                                        "canonicalizeURLs": 0,
                                        "mandatory": 0,
                                        "type": "text",
                                        "format": ""
                                    }
                                ],
                                "pubdate": [
                                    {
                                        "default": "",
                                        "begin": "",
                                        "end": "",
                                        "target": "\/\/*[@class='posted']\/time",
                                        "postProcessing": "",
                                        "join": "concat",
                                        "delimiter": "",
                                        "canonicalizeURLs": 0,
                                        "mandatory": 0,
                                        "type": "datetime",
                                        "format": ""
                                    }
                                ],
                                "author": [
                                    {
                                        "default": "",
                                        "begin": "",
                                        "end": "",
                                        "target": "\/\/*[@class='name fn']\/a",
                                        "postProcessing": "",
                                        "join": "concat",
                                        "delimiter": "",
                                        "canonicalizeURLs": 0,
                                        "mandatory": 0,
                                        "type": "text",
                                        "format": ""
                                    }
                                ],
                                "image": [
                                    {
                                        "default": "",
                                        "begin": "",
                                        "end": "",
                                        "target": "\/\/*[@class='img-caption']\/img",
                                        "postProcessing": "",
                                        "join": "concat",
                                        "delimiter": "",
                                        "canonicalizeURLs": 1,
                                        "mandatory": 0,
                                        "type": "image",
                                        "format": "URL"
                                    }
                                ],
                                "link": [
                                    {
                                        "default": "",
                                        "begin": "",
                                        "end": "",
                                        "target": "\/\/link[@rel='canonical']",
                                        "postProcessing": "",
                                        "join": "concat",
                                        "delimiter": "",
                                        "canonicalizeURLs": 1,
                                        "mandatory": 0,
                                        "type": "link",
                                        "format": ""
                                    }
                                ],
                                "body": [
                                    {
                                        "default": "",
                                        "begin": "",
                                        "end": "",
                                        "target": "\/\/*[@id='mainentrycontent']\/\/p",
                                        "postProcessing": "",
                                        "join": "concat",
                                        "delimiter": "",
                                        "canonicalizeURLs": 0,
                                        "mandatory": 0,
                                        "type": "text",
                                        "format": ""
                                    }
                                ],
                                "description": [
                                    {
                                        "default": "",
                                        "begin": "",
                                        "end": "",
                                        "target": "\/html\/head\/meta[@name='description']\/@content",
                                        "postProcessing": "",
                                        "join": "concat",
                                        "delimiter": "",
                                        "canonicalizeURLs": 0,
                                        "mandatory": 0,
                                        "type": "text",
                                        "format": ""
                                    }
                                ],
                                "keywords": [
                                    {
                                        "default": "",
                                        "begin": "",
                                        "end": "",
                                        "target": "\/html\/head\/meta[@name='keywords']\/@content",
                                        "postProcessing": "",
                                        "join": "concat",
                                        "delimiter": "",
                                        "canonicalizeURLs": 0,
                                        "mandatory": 0,
                                        "type": "text",
                                        "format": ""
                                    }
                                ]
                            }
                        }
                    ],
                    "select": "first_good"
                },
                "PROCESSOR_PROPERTIES": "{\"algorithm\":{\"algorithm_name\":\"regular\"},\"modules\":{\"regular\":[\"ScrapyExtractor\"]},\"SCRAPER_DOWNLOAD_IMAGES\":1,\"SCRAPER_TEXT_MARKUP\":{\"DIV\":\"\\n\",\"P\":\"\\n\",\"H\":\"\\n\",\"TR\":\"\\n\"}}"
            },
            "urlId": "fc727b7e98a47c0e803da0380ad88159"
        }
    ]
}

General parameters, which are the same for all types of request JSON, were considered here.
As you can see, in this JSON the changes mainly affect the properties.template section and properties.PROCESSOR_PROPERTIES – here ‘algorithm_name’ is changed from ‘user_name_algorithm’ to ‘regular’, and only ScrapyExtractor is left in the modules list:

"PROCESSOR_PROPERTIES": "{
    \"algorithm\": {
        \"algorithm_name\": \"regular\"
    },
    \"modules\": {
        \"regular\": [
            \"ScrapyExtractor\"
        ]
    },
    \"SCRAPER_DOWNLOAD_IMAGES\": 1,
    \"SCRAPER_TEXT_MARKUP\": {
        \"DIV\": \"\\n\",
        \"P\": \"\\n\",
        \"H\": \"\\n\",
        \"TR\": \"\\n\"
    }
}"

items.properties.template has undergone significant changes because of the redefinition of the requested tags. So we need to very carefully add all our tags (those defined in items.properties.template.templates.tags) to the items.properties.template.templates.output_format.item string, as %tag_name% placeholders.
The items.properties.template.templates.tags object contains all the tags that we want to select by our own rules. How it works:

 "keywords" - here is the tag's name: [
                                    {
                                        "default": "", here is the default value that will be returned if the XPath does not select any content
                                        "begin": "",
                                        "end": "",
                                        "target": "\/html\/head\/meta[@name='keywords']\/@content", here is the main parameter: the XPath for your tag
                                        "postProcessing": "", here you can enter a regex for post-processing: for example, if your tag is returned with some unnecessary content and you want to delete it
                                        "join": "concat", here is the join parameter: if you have two rules for one tag, then you can choose how their results are joined: whether they are concatenated, or you get the first non-empty result, or the best one (by length). The corresponding values for this parameter are: "concat", "first" or "best"
                                        "delimiter": "", the delimiter used when your XPath returns several results
                                        "canonicalizeURLs": 0, some sites have internal (relative) links, and this flag allows you to get the full URL: "0" means do not canonicalize, "1" means expand the URL to its full form
                                        "mandatory": 0, 
                                        "type": "text", here is the type of the result: "text", "link", "datetime", "html", "image"
                                        "format": ""
                                    }
                                ]

You can add as many tags as you need, but don’t forget to add them to items.properties.template.templates.output_format.item. The rules for composing it do not change; you can view examples in the articles on news or RSS JSONs.

© 2015-2016 TagsReaper. All rights reserved.