/scrape API
The /scrape API extracts structured JSON data from pages using CSS selectors. Requires url and elements array with selectors.
You can check the full Open API schema here.
Quick Start
- cURL
- Javascript
- Python
- Java
- C#
curl --request POST \
--url 'https://production-sfo.browserless.io/scrape?token=YOUR_API_TOKEN_HERE' \
--header 'content-type: application/json' \
--data '{
"url": "https://browserless.io/",
"elements": [
{
"selector": "h1"
}
]
}'
const TOKEN = "YOUR_API_TOKEN_HERE";
const url = `https://production-sfo.browserless.io/scrape?token=${TOKEN}`;
const headers = {
"Cache-Control": "no-cache",
"Content-Type": "application/json"
};
const data = {
url: "https://browserless.io/",
elements: [
{ selector: "h1" }
]
};
const scrapeContent = async () => {
const response = await fetch(url, {
method: 'POST',
headers: headers,
body: JSON.stringify(data)
});
const result = await response.json();
console.log(result);
};
scrapeContent();
import requests
TOKEN = "YOUR_API_TOKEN_HERE"
url = f"https://production-sfo.browserless.io/scrape?token={TOKEN}"
headers = {
"Cache-Control": "no-cache",
"Content-Type": "application/json"
}
data = {
"url": "https://browserless.io/",
"elements": [
{ "selector": "h1" }
]
}
response = requests.post(url, headers=headers, json=data)
result = response.json()
print(result)
import java.io.*;
import java.net.URI;
import java.net.http.*;
public class ScrapeContent {
public static void main(String[] args) {
String TOKEN = "YOUR_API_TOKEN_HERE";
String url = "https://production-sfo.browserless.io/scrape?token=" + TOKEN;
String jsonData = """
{
"url": "https://browserless.io/",
"elements": [
{ "selector": "h1" }
]
}
""";
HttpClient client = HttpClient.newHttpClient();
HttpRequest request = HttpRequest.newBuilder()
.uri(URI.create(url))
.header("Cache-Control", "no-cache")
.header("Content-Type", "application/json")
.POST(HttpRequest.BodyPublishers.ofString(jsonData))
.build();
try {
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
System.out.println("Response: " + response.body());
} catch (Exception e) {
e.printStackTrace();
}
}
}
using System;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
class Program {
static async Task Main(string[] args) {
string TOKEN = "YOUR_API_TOKEN_HERE";
string url = $"https://production-sfo.browserless.io/scrape?token={TOKEN}";
string jsonData = @"
{
""url"": ""https://browserless.io/"",
""elements"": [
{ ""selector"": ""h1"" }
]
}";
using var client = new HttpClient();
var content = new StringContent(jsonData, Encoding.UTF8, "application/json");
try {
var response = await client.PostAsync(url, content);
response.EnsureSuccessStatusCode();
var result = await response.Content.ReadAsStringAsync();
Console.WriteLine("Response: " + result);
} catch (Exception ex) {
Console.WriteLine($"Error: {ex.Message}");
}
}
}
Response
{
"data": [
{
"results": [
{
"attributes": [
{ "name": "class", "value": "..." }
],
"height": 120,
"html": "Headless browser automation, without the hosting headaches",
"left": 32,
"text": "Headless browser automation, without the hosting headaches",
"top": 196,
"width": 736
}
],
"selector": "h1"
}
]
}
Additional Details
We recommend using BrowserQL, Browserless' first-class browser automation API, to scrape content from any website.
The API uses document.querySelectorAll to retrieve all matches on a page. Using a more specific selector can narrow down the returned results. The default behavior is to navigate to the URL specified, wait for the page to load (including parsing and executing JavaScript), and then wait for the elements for a maximum of 30 seconds.
Specifying Page-Load Behavior
The scrape API allows for setting specific page-load behaviors by setting a gotoOptions object in the JSON body. This object is passed directly into Puppeteer's goto() method.
In the example below, we'll set a waitUntil property and a timeout.
- cURL
- Javascript
- Python
- Java
- C#
curl --request POST \
--url 'https://production-sfo.browserless.io/scrape?token=YOUR_API_TOKEN_HERE' \
--header 'content-type: application/json' \
--data '{
"url": "https://example.com/",
"elements": [
{
"selector": "h1"
}
],
"gotoOptions": {
"timeout": 10000,
"waitUntil": "networkidle2"
}
}'
const TOKEN = "YOUR_API_TOKEN_HERE";
const url = `https://production-sfo.browserless.io/scrape?token=${TOKEN}`;
const headers = {
"Cache-Control": "no-cache",
"Content-Type": "application/json"
};
const data = {
url: "https://example.com/",
elements: [
{ selector: "h1" }
],
gotoOptions: {
timeout: 10000,
waitUntil: "networkidle2"
}
};
const scrapeContent = async () => {
const response = await fetch(url, {
method: 'POST',
headers: headers,
body: JSON.stringify(data)
});
const result = await response.json();
console.log(result);
};
scrapeContent();
import requests
TOKEN = "YOUR_API_TOKEN_HERE"
url = f"https://production-sfo.browserless.io/scrape?token={TOKEN}"
headers = {
"Cache-Control": "no-cache",
"Content-Type": "application/json"
}
data = {
"url": "https://example.com/",
"elements": [
{ "selector": "h1" }
],
"gotoOptions": {
"timeout": 10000,
"waitUntil": "networkidle2"
}
}
response = requests.post(url, headers=headers, json=data)
result = response.json()
print(result)
import java.io.*;
import java.net.URI;
import java.net.http.*;
public class ScrapeContentWithOptions {
public static void main(String[] args) {
String TOKEN = "YOUR_API_TOKEN_HERE";
String url = "https://production-sfo.browserless.io/scrape?token=" + TOKEN;
String jsonData = """
{
"url": "https://example.com/",
"elements": [
{ "selector": "h1" }
],
"gotoOptions": {
"timeout": 10000,
"waitUntil": "networkidle2"
}
}
""";
HttpClient client = HttpClient.newHttpClient();
HttpRequest request = HttpRequest.newBuilder()
.uri(URI.create(url))
.header("Cache-Control", "no-cache")
.header("Content-Type", "application/json")
.POST(HttpRequest.BodyPublishers.ofString(jsonData))
.build();
try {
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
System.out.println("Response: " + response.body());
} catch (Exception e) {
e.printStackTrace();
}
}
}
using System;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
class Program {
static async Task Main(string[] args) {
string TOKEN = "YOUR_API_TOKEN_HERE";
string url = $"https://production-sfo.browserless.io/scrape?token={TOKEN}";
string jsonData = @"
{
""url"": ""https://example.com/"",
""elements"": [
{ ""selector"": ""h1"" }
],
""gotoOptions"": {
""timeout"": 10000,
""waitUntil"": ""networkidle2""
}
}";
using var client = new HttpClient();
var content = new StringContent(jsonData, Encoding.UTF8, "application/json");
try {
var response = await client.PostAsync(url, content);
response.EnsureSuccessStatusCode();
var result = await response.Content.ReadAsStringAsync();
Console.WriteLine("Response: " + result);
} catch (Exception ex) {
Console.WriteLine($"Error: {ex.Message}");
}
}
}
Custom behavior with waitFor options
Sometimes it's helpful to do further actions, or wait for custom events on the page before getting data. We allow this behavior with the waitFor properties.
waitForTimeout
Waits for the given number of milliseconds before continuing execution.
- cURL
- Javascript
- Python
- Java
- C#
curl --request POST \
--url 'https://production-sfo.browserless.io/scrape?token=YOUR_API_TOKEN_HERE' \
--header 'content-type: application/json' \
--data '{
"url": "https://example.com/",
"elements": [
{
"selector": "h1"
}
],
"waitForTimeout": 1000
}'
const TOKEN = "YOUR_API_TOKEN_HERE";
const url = `https://production-sfo.browserless.io/scrape?token=${TOKEN}`;
const headers = {
"Cache-Control": "no-cache",
"Content-Type": "application/json"
};
const data = {
url: "https://example.com/",
elements: [
{ selector: "h1" }
],
waitForTimeout: 1000
};
const scrapeContent = async () => {
const response = await fetch(url, {
method: 'POST',
headers: headers,
body: JSON.stringify(data)
});
const result = await response.json();
console.log(result);
};
scrapeContent();
import requests
TOKEN = "YOUR_API_TOKEN_HERE"
url = f"https://production-sfo.browserless.io/scrape?token={TOKEN}"
headers = {
"Cache-Control": "no-cache",
"Content-Type": "application/json"
}
data = {
"url": "https://example.com/",
"elements": [
{ "selector": "h1" }
],
"waitForTimeout": 1000
}
response = requests.post(url, headers=headers, json=data)
result = response.json()
print(result)
import java.io.*;
import java.net.URI;
import java.net.http.*;
public class ScrapeContentWithTimeout {
public static void main(String[] args) {
String TOKEN = "YOUR_API_TOKEN_HERE";
String url = "https://production-sfo.browserless.io/scrape?token=" + TOKEN;
String jsonData = """
{
"url": "https://example.com/",
"elements": [
{ "selector": "h1" }
],
"waitForTimeout": 1000
}
""";
HttpClient client = HttpClient.newHttpClient();
HttpRequest request = HttpRequest.newBuilder()
.uri(URI.create(url))
.header("Cache-Control", "no-cache")
.header("Content-Type", "application/json")
.POST(HttpRequest.BodyPublishers.ofString(jsonData))
.build();
try {
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
System.out.println("Response: " + response.body());
} catch (Exception e) {
e.printStackTrace();
}
}
}
using System;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
class Program {
static async Task Main(string[] args) {
string TOKEN = "YOUR_API_TOKEN_HERE";
string url = $"https://production-sfo.browserless.io/scrape?token={TOKEN}";
string jsonData = @"
{
""url"": ""https://example.com/"",
""elements"": [
{ ""selector"": ""h1"" }
],
""waitForTimeout"": 1000
}";
using var client = new HttpClient();
var content = new StringContent(jsonData, Encoding.UTF8, "application/json");
try {
var response = await client.PostAsync(url, content);
response.EnsureSuccessStatusCode();
var result = await response.Content.ReadAsStringAsync();
Console.WriteLine("Response: " + result);
} catch (Exception ex) {
Console.WriteLine($"Error: {ex.Message}");
}
}
}
waitForSelector
Waits for a selector to appear on the page. If the selector already exists at the moment the method is called, the method returns immediately. If the selector doesn't appear within the timeout (in milliseconds), the function throws an exception.
Example
- cURL
- Javascript
- Python
- Java
- C#
curl --request POST \
--url 'https://production-sfo.browserless.io/scrape?token=YOUR_API_TOKEN_HERE' \
--header 'content-type: application/json' \
--data '{
"url": "https://example.com/",
"elements": [
{
"selector": "h1"
}
],
"waitForSelector": {
"selector": "h1",
"timeout": 5000
}
}'
const TOKEN = "YOUR_API_TOKEN_HERE";
const url = `https://production-sfo.browserless.io/scrape?token=${TOKEN}`;
const headers = {
"Cache-Control": "no-cache",
"Content-Type": "application/json"
};
const data = {
url: "https://example.com/",
elements: [
{ selector: "h1" }
],
waitForSelector: {
selector: "h1",
timeout: 5000
}
};
const scrapeContent = async () => {
const response = await fetch(url, {
method: 'POST',
headers: headers,
body: JSON.stringify(data)
});
const result = await response.json();
console.log(result);
};
scrapeContent();
import requests
TOKEN = "YOUR_API_TOKEN_HERE"
url = f"https://production-sfo.browserless.io/scrape?token={TOKEN}"
headers = {
"Cache-Control": "no-cache",
"Content-Type": "application/json"
}
data = {
"url": "https://example.com/",
"elements": [
{ "selector": "h1" }
],
"waitForSelector": {
"selector": "h1",
"timeout": 5000
}
}
response = requests.post(url, headers=headers, json=data)
result = response.json()
print(result)
import java.io.*;
import java.net.URI;
import java.net.http.*;
public class ScrapeContentWithWaitForSelector {
public static void main(String[] args) {
String TOKEN = "YOUR_API_TOKEN_HERE";
String url = "https://production-sfo.browserless.io/scrape?token=" + TOKEN;
String jsonData = """
{
"url": "https://example.com/",
"elements": [
{ "selector": "h1" }
],
"waitForSelector": {
"selector": "h1",
"timeout": 5000
}
}
""";
HttpClient client = HttpClient.newHttpClient();
HttpRequest request = HttpRequest.newBuilder()
.uri(URI.create(url))
.header("Cache-Control", "no-cache")
.header("Content-Type", "application/json")
.POST(HttpRequest.BodyPublishers.ofString(jsonData))
.build();
try {
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
System.out.println("Response: " + response.body());
} catch (Exception e) {
e.printStackTrace();
}
}
}
using System;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
class Program {
static async Task Main(string[] args) {
string TOKEN = "YOUR_API_TOKEN_HERE";
string url = $"https://production-sfo.browserless.io/scrape?token={TOKEN}";
string jsonData = @"
{
""url"": ""https://example.com/"",
""elements"": [
{ ""selector"": ""h1"" }
],
""waitForSelector"": {
""selector"": ""h1"",
""timeout"": 5000
}
}";
using var client = new HttpClient();
var content = new StringContent(jsonData, Encoding.UTF8, "application/json");
try {
var response = await client.PostAsync(url, content);
response.EnsureSuccessStatusCode();
var result = await response.Content.ReadAsStringAsync();
Console.WriteLine("Response: " + result);
} catch (Exception ex) {
Console.WriteLine($"Error: {ex.Message}");
}
}
}
waitForFunction
Waits for the provided function to return before continuing. The function can be any valid JS function, including async functions.
Example
JS function
async () => {
const res = await fetch('https://jsonplaceholder.typicode.com/todos/1');
const json = await res.json();
document.querySelector("h1").innerText = json.title;
}
- JSON payload
- cURL
- Javascript
- Python
- Java
- C#
{
"url": "https://example.com/",
"elements": [
{ "selector": "h1" }
],
"waitForFunction": {
"fn": "async()=>{let t=await fetch('https://jsonplaceholder.typicode.com/todos/1'),e=await t.json();document.querySelector('h1').innerText=e.title}",
"timeout": 5000
}
}
curl --request POST \
--url 'https://production-sfo.browserless.io/scrape?token=YOUR_API_TOKEN_HERE' \
--header 'content-type: application/json' \
--data '{
"url": "https://example.com/",
"elements": [
{
"selector": "h1"
}
],
"waitForFunction": {
"fn": "async()=>{let t=await fetch('\''https://jsonplaceholder.typicode.com/todos/1'\''),e=await t.json();document.querySelector('\''h1'\'').innerText=e.title}",
"timeout": 5000
}
}'
// Scrape the <h1> of example.com after a custom in-page function has returned.
const TOKEN = "YOUR_API_TOKEN_HERE";
const url = `https://production-sfo.browserless.io/scrape?token=${TOKEN}`;
const headers = {
  "Cache-Control": "no-cache",
  "Content-Type": "application/json"
};
const data = {
  url: "https://example.com/",
  // The /scrape API requires an `elements` array alongside `url`.
  elements: [
    { selector: "h1" }
  ],
  waitForFunction: {
    // Minified async function: fetches a sample todo and writes its title into the page's <h1>.
    fn: "async()=>{let t=await fetch('https://jsonplaceholder.typicode.com/todos/1'),e=await t.json();document.querySelector('h1').innerText=e.title}",
    timeout: 5000
  }
};

// POST the payload to /scrape and log the parsed JSON response.
const scrapeContent = async () => {
  const response = await fetch(url, {
    method: 'POST',
    headers: headers,
    body: JSON.stringify(data)
  });
  const result = await response.json();
  console.log(result);
};
scrapeContent();
import requests
# Scrape the <h1> of example.com after a custom in-page function has returned.
TOKEN = "YOUR_API_TOKEN_HERE"
url = f"https://production-sfo.browserless.io/scrape?token={TOKEN}"
headers = {
    "Cache-Control": "no-cache",
    "Content-Type": "application/json"
}
data = {
    "url": "https://example.com/",
    # The /scrape API requires an "elements" array alongside "url".
    "elements": [
        { "selector": "h1" }
    ],
    "waitForFunction": {
        # Minified async function: fetches a sample todo and writes its title into the page's <h1>.
        "fn": "async()=>{let t=await fetch('https://jsonplaceholder.typicode.com/todos/1'),e=await t.json();document.querySelector('h1').innerText=e.title}",
        "timeout": 5000
    }
}
# POST the payload as JSON and print the decoded JSON response.
response = requests.post(url, headers=headers, json=data)
result = response.json()
print(result)
import java.io.*;
import java.net.URI;
import java.net.http.*;
/**
 * Sends a /scrape request that waits for a custom in-page function to return
 * before extracting the h1 element, then prints the raw response body.
 */
public class ScrapeContentWithWaitForFunction {
    public static void main(String[] args) {
        String TOKEN = "YOUR_API_TOKEN_HERE";
        String url = "https://production-sfo.browserless.io/scrape?token=" + TOKEN;
        // The /scrape API requires an "elements" array alongside "url".
        // "fn" is a minified async function that fetches a sample todo and
        // writes its title into the page's <h1>.
        String jsonData = """
            {
              "url": "https://example.com/",
              "elements": [
                { "selector": "h1" }
              ],
              "waitForFunction": {
                "fn": "async()=>{let t=await fetch('https://jsonplaceholder.typicode.com/todos/1'),e=await t.json();document.querySelector('h1').innerText=e.title}",
                "timeout": 5000
              }
            }
            """;
        HttpClient client = HttpClient.newHttpClient();
        HttpRequest request = HttpRequest.newBuilder()
            .uri(URI.create(url))
            .header("Cache-Control", "no-cache")
            .header("Content-Type", "application/json")
            .POST(HttpRequest.BodyPublishers.ofString(jsonData))
            .build();
        try {
            HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
            System.out.println("Response: " + response.body());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
using System;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
/// <summary>
/// Sends a /scrape request that waits for a custom in-page function to return
/// before extracting the h1 element, then prints the raw response body.
/// </summary>
class Program {
    static async Task Main(string[] args) {
        string TOKEN = "YOUR_API_TOKEN_HERE";
        string url = $"https://production-sfo.browserless.io/scrape?token={TOKEN}";
        // The /scrape API requires an "elements" array alongside "url".
        // "fn" is a minified async function that fetches a sample todo and
        // writes its title into the page's <h1>.
        string jsonData = @"
        {
            ""url"": ""https://example.com/"",
            ""elements"": [
                { ""selector"": ""h1"" }
            ],
            ""waitForFunction"": {
                ""fn"": ""async()=>{let t=await fetch('https://jsonplaceholder.typicode.com/todos/1'),e=await t.json();document.querySelector('h1').innerText=e.title}"",
                ""timeout"": 5000
            }
        }";
        using var client = new HttpClient();
        var content = new StringContent(jsonData, Encoding.UTF8, "application/json");
        try {
            var response = await client.PostAsync(url, content);
            // Throws on a non-success status code so the failure is reported by the catch block below.
            response.EnsureSuccessStatusCode();
            var result = await response.Content.ReadAsStringAsync();
            Console.WriteLine("Response: " + result);
        } catch (Exception ex) {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
waitForEvent
Waits for an event to happen on the page before continuing.
Example
- JSON payload
- cURL
- Javascript
- Python
- Java
- C#
// Will fail since the event never fires
{
"url": "https://example.com/",
"elements": [
{ "selector": "h1" }
],
"waitForEvent": {
"event": "fullscreenchange",
"timeout": 5000
}
}
curl --request POST \
--url 'https://production-sfo.browserless.io/scrape?token=YOUR_API_TOKEN_HERE' \
--header 'content-type: application/json' \
--data '{
"url": "https://example.com/",
"elements": [
{
"selector": "h1"
}
],
"waitForEvent": {
"event": "fullscreenchange",
"timeout": 5000
}
}'
// Scrape the <h1> of example.com after waiting for a page event.
// The event used here (fullscreenchange) never fires, so this request is expected to fail.
const TOKEN = "YOUR_API_TOKEN_HERE";
// Must target /scrape: the payload below uses the /scrape body shape (url + elements).
const url = `https://production-sfo.browserless.io/scrape?token=${TOKEN}`;
const headers = {
  "Cache-Control": "no-cache",
  "Content-Type": "application/json"
};
const data = {
  url: "https://example.com/",
  elements: [
    { selector: "h1" }
  ],
  waitForEvent: {
    event: "fullscreenchange",
    timeout: 5000
  }
};

// POST the payload to /scrape and log the parsed JSON response.
const fetchContent = async () => {
  const response = await fetch(url, {
    method: 'POST',
    headers: headers,
    body: JSON.stringify(data)
  });
  const result = await response.json();
  console.log(result);
};
fetchContent();
import requests
# Scrape the <h1> of example.com after waiting for a page event.
# The event used here (fullscreenchange) never fires, so this request is expected to fail.
TOKEN = "YOUR_API_TOKEN_HERE"
# Must target /scrape: the payload below uses the /scrape body shape (url + elements).
url = f"https://production-sfo.browserless.io/scrape?token={TOKEN}"
headers = {
    "Cache-Control": "no-cache",
    "Content-Type": "application/json"
}
data = {
    "url": "https://example.com/",
    "elements": [
        { "selector": "h1" }
    ],
    "waitForEvent": {
        "event": "fullscreenchange",
        "timeout": 5000
    }
}
# POST the payload as JSON and print the decoded JSON response.
response = requests.post(url, headers=headers, json=data)
result = response.json()
print(result)
import java.io.*;
import java.net.URI;
import java.net.http.*;
/**
 * Sends a /scrape request that waits for a page event before extracting the
 * h1 element, then prints the raw response body. The event used here
 * (fullscreenchange) never fires, so this request is expected to fail.
 */
public class FetchContentWithWaitForEvent {
    public static void main(String[] args) {
        String TOKEN = "YOUR_API_TOKEN_HERE";
        // Must target /scrape: the payload below uses the /scrape body shape (url + elements).
        String url = "https://production-sfo.browserless.io/scrape?token=" + TOKEN;
        String jsonData = """
            {
              "url": "https://example.com/",
              "elements": [
                { "selector": "h1" }
              ],
              "waitForEvent": {
                "event": "fullscreenchange",
                "timeout": 5000
              }
            }
            """;
        HttpClient client = HttpClient.newHttpClient();
        HttpRequest request = HttpRequest.newBuilder()
            .uri(URI.create(url))
            .header("Cache-Control", "no-cache")
            .header("Content-Type", "application/json")
            .POST(HttpRequest.BodyPublishers.ofString(jsonData))
            .build();
        try {
            HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
            System.out.println("Response: " + response.body());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
using System;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
/// <summary>
/// Sends a /scrape request that waits for a page event before extracting the
/// h1 element, then prints the raw response body. The event used here
/// (fullscreenchange) never fires, so this request is expected to fail.
/// </summary>
class Program {
    static async Task Main(string[] args) {
        string TOKEN = "YOUR_API_TOKEN_HERE";
        // Must target /scrape: the payload below uses the /scrape body shape (url + elements).
        string url = $"https://production-sfo.browserless.io/scrape?token={TOKEN}";
        string jsonData = @"
        {
            ""url"": ""https://example.com/"",
            ""elements"": [
                { ""selector"": ""h1"" }
            ],
            ""waitForEvent"": {
                ""event"": ""fullscreenchange"",
                ""timeout"": 5000
            }
        }";
        using var client = new HttpClient();
        var content = new StringContent(jsonData, Encoding.UTF8, "application/json");
        try {
            var response = await client.PostAsync(url, content);
            // Throws on a non-success status code so the failure is reported by the catch block below.
            response.EnsureSuccessStatusCode();
            var result = await response.Content.ReadAsStringAsync();
            Console.WriteLine("Response: " + result);
        } catch (Exception ex) {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}