Skip to main content
POST /v1/crawl
Create crawl task
curl --request POST \
  --url https://gateway.webit.live/v1/crawl \
  --header 'Content-Type: application/json' \
  --data '
{
  "url": "https://example.com",
  "name": "The best crawl ever",
  "sitemap": "include",
  "crawl_entire_domain": false,
  "limit": 100,
  "max_discovery_depth": 3,
  "exclude_paths": [
    "/exclude-this-path",
    "/and-this-path"
  ],
  "include_paths": [
    "/include-this-path",
    "/and-this-path"
  ],
  "ignore_query_parameters": false,
  "allow_external_links": false,
  "allow_subdomains": false,
  "callback": {
    "url": "https://example.com/webhook",
    "headers": {
      "X-Custom-Header": "value"
    },
    "metadata": {
      "crawlId": "12345"
    },
    "events": [
      "page"
    ]
  },
  "extract_options": {
    "debug_options": {
      "collect_har": true,
      "record_screen": true,
      "verbose": true,
      "trace": true,
      "no_retry_mode": true,
      "upload_engine_logs": true,
      "solve_captcha": true,
      "show_cursor": true,
      "with_proxy_usage": true,
      "redact": true
    },
    "url": "https://example.com/page",
    "cookies": [],
    "parse_options": {
      "merge_dynamic": true
    },
    "parse": true,
    "dynamic_parser": {
      "myParser": {
        "option1": "value1"
      }
    },
    "parser": {
      "myParser": {
        "option1": "value1"
      }
    },
    "type": "generic",
    "method": "GET",
    "referrer_type": "no-referrer",
    "expected_status_codes": [
      200,
      201
    ],
    "headers": {
      "User-Agent": "CustomBot/1.0",
      "Accept-Language": "en-US"
    },
    "raw_headers": true,
    "request_timeout": 30000,
    "client_timeout": 25000,
    "return_response_headers_as_header": true,
    "format": "json",
    "skill": "dynamic-content",
    "http2": true,
    "ip6": false,
    "is_xhr": true,
    "no_html": false,
    "export_userbrowser": false,
    "save_userbrowser": false,
    "native_mode": "requester",
    "driver": "vx8",
    "disable_ip_check": false,
    "template": {
      "name": "<string>",
      "params": {}
    },
    "markdown": false,
    "consent_header": true,
    "skip_ubct": false,
    "userbrowser_creation_template_rendered": {
      "id": "<string>",
      "allowed_parameter_names": [
        "<string>"
      ],
      "render_flow_rendered": [
        {}
      ]
    },
    "query_template": {
      "id": "3c90c3cc-0d44-4b50-8888-8dd25736052a",
      "api_type": "WEB",
      "params": {},
      "pagination": {
        "next_page_params": {}
      }
    },
    "render": true,
    "render_options": {
      "wait_until": "networkidle2",
      "render_type": "load",
      "headless": true,
      "timeout": 30000,
      "userbrowser": true,
      "connector_type": "webit-cdp",
      "hackium_configuration": {
        "collect_logs": false,
        "enable_verbose_logs": false,
        "enable_sniffer": false,
        "do_not_fix_math_salt": false,
        "enable_document_element_spoof": false,
        "enable_key_ordering": false,
        "enable_document_has_focus": false,
        "enable_fake_navigation_history": false
      },
      "include_iframes": true,
      "browser_engine": "chrome",
      "fingerprint_id": "fp-abc123",
      "disabled_resources": [
        "image",
        "stylesheet"
      ],
      "adblock": true,
      "cache": false,
      "blocked_domains": [
        "ads.example.com",
        "tracker.com"
      ],
      "with_performance_metrics": true,
      "no_accept_encoding": true,
      "override_permissions": true,
      "store_local_storage": true,
      "load_local_storage": true,
      "local_storage_keys_to_load": [
        "authToken",
        "userId"
      ],
      "enable_2captcha": true,
      "mouse_strategy": "linear",
      "typing_strategy": "simple",
      "typing_interval": 100,
      "random_header_order": true,
      "extensions": [
        "extension-id-1",
        "extension-id-2"
      ]
    },
    "network_capture": [
      {
        "status_code": 200,
        "method": "GET",
        "url": {
          "value": "<string>",
          "type": "exact"
        },
        "resource_type": [
          "document",
          "script",
          "xhr",
          "fetch"
        ],
        "validation": false,
        "wait_for_requests_count": 0,
        "wait_for_requests_count_timeout": 150000
      }
    ],
    "render_flow": [
      {
        "wait": {
          "delay": 2000
        }
      },
      {
        "click": {
          "selector": "#load-more",
          "timeout": 5000
        }
      }
    ],
    "session": {
      "id": "<string>",
      "timeout": 1,
      "retry": false,
      "prefetch_userbrowser": false
    },
    "tag": "campaign-2024-q1",
    "metadata": {
      "source": "web-app",
      "pipeline_execution_id": 12345,
      "execution_id": "exec-abc123",
      "endpoint": "/api/v2/scrape",
      "definition_id": 456,
      "definition_name": "product-scraper",
      "template_id": 789,
      "template_name": "e-commerce-template",
      "account_name": "acme-corp",
      "flowit_task_id": "task-xyz789",
      "input_id": "input-123",
      "query_template_id": "template-qry-001"
    },
    "locale": "en-US",
    "country": "US",
    "device": "desktop",
    "proxy_provider": "brightdata",
    "proxy_providers": {
      "brightdata": 70,
      "oxylabs": 30
    },
    "browser": "chrome",
    "os": "windows",
    "no_userbrowser": false,
    "state": "CA",
    "city": "Los Angeles"
  }
}
'
{
  "id": "3c90c3cc-0d44-4b50-8888-8dd25736052a",
  "url": "<string>"
}

Body

application/json
url
string<uri>
required

Url to crawl.

Example:

"https://example.com"

name
string

Name of the crawl.

Example:

"The best crawl ever"

sitemap
enum<string>
default:include

Sitemap and other methods will be used together to find URLs.

Available options:
skip,
include,
only
Example:

"include"

crawl_entire_domain
boolean
default:false

Allows the crawler to follow internal links to sibling or parent URLs, not just child paths.

Example:

false

limit
integer
default:5000

Maximum number of pages to crawl.

Required range: 1 <= x <= 10000
Example:

100

max_discovery_depth
integer
default:5

Maximum depth to crawl based on discovery order.

Required range: 1 <= x <= 20
Example:

3

exclude_paths
string[]

URL pathname regex patterns that exclude matching URLs from the crawl.

Example:
["/exclude-this-path", "/and-this-path"]
include_paths
string[]

URL pathname regex patterns that include matching URLs in the crawl.

Example:
["/include-this-path", "/and-this-path"]
ignore_query_parameters
boolean
default:false

Do not re-scrape the same path with different (or no) query parameters.

Example:

false

allow_external_links
boolean
default:false

Allows the crawler to follow links to external websites.

Example:

false

allow_subdomains
boolean
default:false

Allows the crawler to follow links to subdomains of the main domain.

Example:

false

callback
object

Webhook configuration for receiving crawl results.

extract_options
object

Response

Successful Response

id
string<uuid>
required
url
string<uri>
required