File: examples.md | Updated: 11/18/2025
This page contains additional code examples for working with warcio.
Iterate over all records in a WARC file:
from warcio.archiveiterator import ArchiveIterator

# Walk the archive one record at a time and report each record's type
# together with its WARC-level headers.
with open('path/to/file.warc.gz', 'rb') as warc_file:
    all_records = ArchiveIterator(warc_file)
    for record in all_records:
        print(f"Type: {record.rec_type}")
        print(f"Headers: {record.rec_headers}")
Print URLs of all response records:
from warcio.archiveiterator import ArchiveIterator

# List the target URI of every 'response' record in the archive.
with open('path/to/file.warc.gz', 'rb') as warc_file:
    for entry in ArchiveIterator(warc_file):
        if entry.rec_type != 'response':
            continue  # only captured responses are of interest here
        print(entry.rec_headers.get_header('WARC-Target-URI'))
Extract and print HTML content from response records:
from warcio.archiveiterator import ArchiveIterator

# Print the URL and the first 200 bytes of every HTML response in the archive.
with open('path/to/file.warc.gz', 'rb') as stream:
    for record in ArchiveIterator(stream):
        if record.rec_type != 'response':
            continue

        # NOTE(review): http_headers appears to be None when warcio did not
        # parse an HTTP message for the record (e.g. non-HTTP captures such
        # as dns: lookups), so guard before dereferencing it.
        if record.http_headers is None:
            continue

        content_type = record.http_headers.get_header('Content-Type')
        # Media type names are case-insensitive, so compare in lower case.
        if content_type and 'text/html' in content_type.lower():
            uri = record.rec_headers.get_header('WARC-Target-URI')
            # content_stream() yields the decoded payload as bytes.
            html_content = record.content_stream().read()
            print(f"URL: {uri}")
            print(f"HTML: {html_content[:200]}...")  # First 200 bytes
            print()
Stream and process WARC files from a remote URL:
import requests
from warcio.archiveiterator import ArchiveIterator

url = 'https://archive.org/download/ExampleArcAndWarcFiles/IAH-20080430204825-00000-blackbook.warc.gz'

# stream=True leaves the body unread on resp.raw so the archive is parsed as
# it downloads; the `with` block releases the connection when iteration ends
# or fails.
with requests.get(url, stream=True) as resp:
    # Fail fast on 4xx/5xx instead of trying to parse an HTTP error page as
    # a WARC file.
    resp.raise_for_status()
    for record in ArchiveIterator(resp.raw):
        if record.rec_type == 'response':
            print(record.rec_headers.get_header('WARC-Target-URI'))
Read ARC files and convert them to WARC format on the fly:
import requests
from warcio.archiveiterator import ArchiveIterator


def print_records(url):
    """Stream an ARC file from ``url`` and print selected records.

    The archive is read with ``arc2warc=True`` so each ARC record is
    presented as a WARC record. The raw content of every ``warcinfo``
    record is printed; for every HTML ``response`` record the target URI
    and the decoded payload are printed, followed by an empty line.

    Args:
        url: Location of the (optionally gzip-compressed) ARC file.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # The `with` block closes the streamed connection even if parsing fails.
    with requests.get(url, stream=True) as resp:
        # Do not attempt to parse an HTTP error page as an archive.
        resp.raise_for_status()

        for record in ArchiveIterator(resp.raw, arc2warc=True):
            if record.rec_type == 'warcinfo':
                print(record.raw_stream.read())

            # NOTE(review): http_headers appears to be None for records
            # without a parsed HTTP message (e.g. dns: captures), so guard
            # before dereferencing it.
            elif record.rec_type == 'response' and record.http_headers is not None:
                content_type = record.http_headers.get_header('Content-Type')
                # Substring, case-insensitive match so values such as
                # 'text/html; charset=UTF-8' are recognised as HTML too.
                if content_type and 'text/html' in content_type.lower():
                    print(record.rec_headers.get_header('WARC-Target-URI'))
                    print(record.content_stream().read())
                    print('')


# Read ARC file with conversion
print_records('https://archive.org/download/ExampleArcAndWarcFiles/IAH-20080430204825-00000-blackbook.arc.gz')
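Capture a single HTTP request to a WARC file (note that `capture_http` must be imported before `requests`, so that it can hook the underlying HTTP library):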
# NOTE: keep this import order; per the warcio documentation, capture_http
# has to be imported before requests.
from warcio.capture_http import capture_http
import requests

# Traffic generated inside the block is recorded to the named WARC file.
warc_path = 'example.warc.gz'
with capture_http(warc_path):
    requests.get('https://example.com/')
Capture multiple HTTP requests to a single WARC file:
# NOTE: keep this import order; per the warcio documentation, capture_http
# has to be imported before requests.
from warcio.capture_http import capture_http
import requests

# All requests issued inside the block end up in the same WARC file.
with capture_http('multiple.warc.gz'):
    for page in ('https://example.com/', 'https://www.iana.org/'):
        requests.get(page)
    requests.post('https://httpbin.org/post', data={'key': 'value'})
Write to an in-memory buffer instead of a file:
# NOTE: keep this import order; per the warcio documentation, capture_http
# has to be imported before requests.
from warcio.capture_http import capture_http
from warcio.archiveiterator import ArchiveIterator
import requests

# Called without a filename, capture_http hands back a writer whose buffer
# can be read afterwards via get_stream().
with capture_http() as writer:
    requests.get('https://example.com/')

# Read back the records from the buffer
captured = ArchiveIterator(writer.get_stream())
for record in captured:
    print(f"{record.rec_type}: {record.rec_headers.get_header('WARC-Target-URI')}")
Create WARC files using the WARC 1.1 standard:
# NOTE: keep this import order; per the warcio documentation, capture_http
# has to be imported before requests.
from warcio.capture_http import capture_http
import requests

# warc_version selects the WARC format revision used for the written records.
warc_path = 'example11.warc.gz'
with capture_http(warc_path, warc_version='1.1'):
    requests.get('https://example.com/')
Filter which requests/responses to save:
from warcio.capture_http import capture_http
import requests
def filter_records(request, response, request_recorder):
    """Keep a request/response pair only when the response status is 200.

    Args:
        request: The captured request record.
        response: The captured response record; its HTTP status code is read
            through ``response.http_headers.get_statuscode()``.
        request_recorder: Unused here; accepted because the filter is called
            with three arguments.

    Returns:
        ``(request, response)`` unchanged for a ``'200'`` status, otherwise
        ``(None, None)`` to signal that the pair should be skipped.
    """
    status = response.http_headers.get_statuscode()
    if status == '200':
        return request, response
    # Only save successful responses (200 status)
    return None, None
# The filter function is passed as the second positional argument; it decides
# per request/response pair whether the records are written.
targets = (
    'https://example.com/',
    'https://httpbin.org/status/404',  # This will be skipped
)
with capture_http('filtered.warc.gz', filter_records):
    for target in targets:
        requests.get(target)
Write WARC records manually with full control:
from warcio.warcwriter import WARCWriter
from warcio.statusandheaders import StatusAndHeaders
import requests

with open('manual.warc.gz', 'wb') as output:
    writer = WARCWriter(output, gzip=True)

    # Fetch content. 'identity' asks the server for an uncompressed body and
    # stream=True leaves that body unread on resp.raw so it can be used as the
    # record payload. The `with` block closes the connection afterwards.
    with requests.get('http://example.com/',
                      headers={'Accept-Encoding': 'identity'},
                      stream=True) as resp:
        # Get raw headers from urllib3
        headers_list = resp.raw.headers.items()

        # Record the status the server actually returned rather than
        # assuming the request succeeded.
        status_line = f"{resp.status_code} {resp.reason}"
        http_headers = StatusAndHeaders(status_line, headers_list, protocol='HTTP/1.0')

        # Create and write WARC record
        record = writer.create_warc_record('http://example.com/', 'response',
                                           payload=resp.raw,
                                           http_headers=http_headers)
        writer.write_record(record)
Use a custom WARCWriter instance with capture_http:
# NOTE: keep this import order; per the warcio documentation, capture_http
# has to be imported before requests.
from warcio.capture_http import capture_http
from warcio import WARCWriter
import requests

# Hand capture_http an existing writer instead of a filename so the caller
# controls where and how the records are written.
with open('custom.warc.gz', 'wb') as fh:
    warc_writer = WARCWriter(fh, gzip=True)
    with capture_http(warc_writer):
        for site in ('https://example.com/', 'https://www.iana.org/'):
            requests.get(site)
Extract all unique URLs from a WARC file:
from warcio.archiveiterator import ArchiveIterator

# Collect the distinct target URIs of all request and response records,
# then print them in sorted order.
urls = set()
with open('path/to/file.warc.gz', 'rb') as warc_file:
    for entry in ArchiveIterator(warc_file):
        if entry.rec_type not in ('response', 'request'):
            continue
        uri = entry.rec_headers.get_header('WARC-Target-URI')
        if uri:  # skip records without a target URI
            urls.add(uri)

for url in sorted(urls):
    print(url)
Count the number of each record type in a WARC file:
from warcio.archiveiterator import ArchiveIterator
from collections import Counter

# Tally how many records of each type the archive contains.
record_types = Counter()
with open('path/to/file.warc.gz', 'rb') as warc_file:
    # Counter.update consumes the generator and increments one key per record.
    record_types.update(entry.rec_type for entry in ArchiveIterator(warc_file))

for rec_type, count in record_types.items():
    print(f"{rec_type}: {count}")
Save all images from a WARC file:
from warcio.archiveiterator import ArchiveIterator
import os


def _image_extension(content_type):
    """Derive a file-name-safe extension from an image Content-Type value.

    Parameters (``; charset=...``) are removed before the subtype is taken,
    a structured-syntax suffix is dropped (``svg+xml`` -> ``svg``) and any
    character that is not alphanumeric, ``-``, ``_`` or ``.`` is discarded,
    because the value comes from archived (untrusted) headers.

    Args:
        content_type: Raw Content-Type header value, e.g. ``'image/png'``.

    Returns:
        The extension without a leading dot, or ``'bin'`` if nothing usable
        remains.
    """
    media_type = content_type.split(';', 1)[0].strip().lower()
    subtype = media_type.partition('/')[2].partition('+')[0]
    cleaned = ''.join(ch for ch in subtype if ch.isalnum() or ch in '-_.')
    return cleaned.strip('.') or 'bin'


output_dir = 'extracted_images'
os.makedirs(output_dir, exist_ok=True)

with open('path/to/file.warc.gz', 'rb') as stream:
    # The enumerate index counts every record, not only images, so file
    # numbers are unique but not consecutive.
    for i, record in enumerate(ArchiveIterator(stream)):
        # NOTE(review): http_headers appears to be None for records without
        # a parsed HTTP message, so guard before dereferencing it.
        if record.rec_type != 'response' or record.http_headers is None:
            continue

        content_type = record.http_headers.get_header('Content-Type')
        # Media type names are case-insensitive.
        if not content_type or not content_type.strip().lower().startswith('image/'):
            continue

        uri = record.rec_headers.get_header('WARC-Target-URI')
        ext = _image_extension(content_type)
        filename = f"{output_dir}/image_{i}.{ext}"
        with open(filename, 'wb') as img_file:
            img_file.write(record.content_stream().read())
        print(f"Saved: {filename} from {uri}")