Metadata Extraction Guide v2.13.0¶
This guide shows how to extract structured metadata from HTML documents during conversion using the convert_with_metadata() API.
Overview¶
The metadata API returns both the converted Markdown and a structured metadata object in a single call. Metadata extraction happens during the same traversal pass as conversion, so there is minimal overhead.
For background on what metadata is extracted and how it works, see the Metadata Extraction concept page.
Basic Usage¶
from html_to_markdown import convert_with_metadata, MetadataConfig
metadata_config = MetadataConfig(
extract_headers=True,
extract_links=True,
extract_images=True,
extract_structured_data=True,
max_structured_data_size=100000,
)
markdown, metadata = convert_with_metadata(html, metadata_config=metadata_config)
import { convertWithMetadata } from '@kreuzberg/html-to-markdown';
const result = convertWithMetadata('<h1>Title</h1><p>Content</p>');
const { markdown, metadata } = result;
console.log(markdown); // Converted markdown
console.log(metadata.document); // Document metadata (title, description, etc.)
console.log(metadata.headers); // Header elements (h1-h6)
console.log(metadata.links); // Extracted links
console.log(metadata.images); // Extracted images
use HtmlToMarkdown\Config\ConversionOptions;
use HtmlToMarkdown\Service\Converter;
use function HtmlToMarkdown\convert_with_metadata;
$html = '<html><head><title>Example</title></head><body><h1>Welcome</h1><a href="https://example.com">Link</a></body></html>';
// Object-oriented API
$converter = Converter::create();
$result = $converter->convertWithMetadata(
$html,
new ConversionOptions(headingStyle: 'Atx'),
[
'extract_headers' => true,
'extract_links' => true,
'extract_images' => true,
]
);
echo $result['markdown'];
echo $result['metadata']->document->title;
foreach ($result['metadata']->links as $link) {
echo $link->href . ': ' . $link->text;
}
// Procedural API
$result = convert_with_metadata(
$html,
new ConversionOptions(headingStyle: 'Atx'),
['extract_headers' => true, 'extract_links' => true]
);
#include "html_to_markdown.h"
#include <stdio.h>
#include <string.h> /* strlen */

/*
 * Convert a small in-memory HTML document and print the resulting JSON
 * (Markdown plus metadata) to stdout.
 *
 * Returns 0 on success, 1 if the conversion call returned NULL.
 */
int main(void) {
    const char *html = "<html><head><title>Page</title></head>"
                       "<body><h1>Hello</h1></body></html>";

    /* NOTE(review): NULL / 0 for the trailing arguments presumably select
     * the default options -- confirm against html_to_markdown.h. */
    char *result_json = html_to_markdown_convert_with_metadata_with_len(
        html, strlen(html), NULL, 0);
    if (!result_json) {
        /* Report the failure instead of exiting silently with status 0. */
        fprintf(stderr, "html-to-markdown: conversion failed\n");
        return 1;
    }

    /* result_json is a JSON string with "markdown" and "metadata" fields */
    printf("%s\n", result_json);

    /* The string is released with the library's own free function. */
    html_to_markdown_free_string(result_json);
    return 0;
}
html = """
<html>
<head><title>Example</title></head>
<body>
<h1 id="welcome">Welcome</h1>
<a href="https://example.com">Example link</a>
</body>
</html>
"""
{:ok, markdown, metadata} = HtmlToMarkdown.convert_with_metadata(html)
metadata["document"]["title"] # "Example"
metadata["headers"] |> hd() |> Map.get("text") # "Welcome"
library(htmltomarkdown)
html <- '
<html>
<head><title>Example</title></head>
<body>
<h1 id="welcome">Welcome</h1>
<a href="https://example.com">Example link</a>
</body>
</html>'
result <- convert_with_metadata(html)
cat(result$markdown)
result$metadata$document$title # "Example"
result$metadata$headers[[1]]$text # "Welcome"
use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
let html = r#"
<html lang="en">
<head><title>Example</title></head>
<body>
<h1 id="welcome">Welcome</h1>
<a href="https://example.com">Example link</a>
</body>
</html>
"#;
let config = MetadataConfig::default();
let (markdown, metadata) = convert_with_metadata(html, None, config, None)?;
println!("Title: {:?}", metadata.document.title);
println!("Headers: {:?}", metadata.headers);
println!("Links: {:?}", metadata.links);
Configuring Metadata Extraction¶
Control which categories of metadata are extracted using MetadataConfig:
from html_to_markdown import convert_with_metadata, MetadataConfig
# Extract only headers and links, skip everything else
config = MetadataConfig(
extract_document=False,
extract_headers=True,
extract_links=True,
extract_images=False,
extract_structured_data=False,
)
markdown, metadata = convert_with_metadata(html, metadata_config=config)
let config = MetadataConfig {
extract_document: false,
extract_headers: true,
extract_links: true,
extract_images: false,
extract_structured_data: false,
max_structured_data_size: 0,
};
let (markdown, metadata) = convert_with_metadata(html, None, config, None)?;
assert!(metadata.images.is_empty()); // Not extracted
Performance benefit
Disabling extraction categories you do not need skips the corresponding collection logic entirely. This is particularly beneficial when processing large documents with many links or images that you do not need to catalog.
Working with Document Metadata¶
Document-level metadata comes from the tags inside <head> and from the attributes of the <html> element:
markdown, metadata = convert_with_metadata(html)
doc = metadata.document # or metadata["document"] depending on binding
# Basic fields
print(doc.title) # From <title> tag
print(doc.description) # From <meta name="description">
print(doc.author) # From <meta name="author">
print(doc.language) # From <html lang="...">
print(doc.charset) # From <meta charset="...">
# Open Graph
print(doc.open_graph) # Dict of og:* properties
# {"title": "...", "description": "...", "image": "...", "url": "..."}
# Twitter Card
print(doc.twitter_card) # Dict of twitter:* properties
# {"card": "summary_large_image", "site": "@handle"}
Working with Headers¶
Extracted headers preserve hierarchy and IDs for table-of-contents generation:
markdown, metadata = convert_with_metadata(html)
for header in metadata.headers:
indent = " " * (header.level - 1)
anchor = f"#{header.id}" if header.id else ""
print(f"{indent}H{header.level}: {header.text} {anchor}")
# Output:
# H1: Introduction #intro
# H2: Background #background
# H2: Methodology #methodology
# H3: Data Collection #data-collection
Building a Table of Contents¶
def build_toc(headers):
    """Render extracted headers as a nested Markdown bullet list.

    Args:
        headers: Iterable of header objects exposing ``level`` (1 for the
            top level), ``text`` and ``id`` (falsy when the heading has
            no id attribute).

    Returns:
        The table of contents as one newline-joined string; an empty
        string when ``headers`` is empty.
    """
    lines = []
    for h in headers:
        # Two spaces per level: a Markdown child item must be indented to
        # the parent's content column ("- " is two characters wide), or the
        # renderer treats it as a sibling and the list comes out flat.
        indent = "  " * (h.level - 1)
        # Only headings with an id can be linked; others are plain text.
        entry = f"[{h.text}](#{h.id})" if h.id else h.text
        lines.append(f"{indent}- {entry}")
    return "\n".join(lines)
toc = build_toc(metadata.headers)
Working with Links¶
Links are classified by type for easy filtering:
markdown, metadata = convert_with_metadata(html)
# Filter by type
external = [l for l in metadata.links if l.link_type == "external"]
internal = [l for l in metadata.links if l.link_type == "internal"]
anchors = [l for l in metadata.links if l.link_type == "anchor"]
emails = [l for l in metadata.links if l.link_type == "email"]
# Access link details
for link in external:
print(f" {link.text} -> {link.href}")
print(f" rel: {link.rel}")
Link Types¶
| Type | Pattern | Example |
|---|---|---|
| external | Full URL with domain | https://example.com/page |
| internal | Relative path | /about, ../contact |
| anchor | Fragment-only | #section, #top |
| email | mailto: scheme | mailto:user@example.com |
| phone | tel: scheme | tel:+1234567890 |
Working with Images¶
markdown, metadata = convert_with_metadata(html)
for img in metadata.images:
print(f" Source: {img.src}")
print(f" Alt: {img.alt}")
print(f" Type: {img.image_type}") # "external", "data_uri", "inline"
Image Types¶
| Type | Description |
|---|---|
| external | Standard URL (https://cdn.example.com/img.jpg) |
| data_uri | Base64-encoded data URI (data:image/png;base64,...) |
| inline | Inline SVG or other embedded content |
Working with Structured Data¶
Structured data extraction captures machine-readable data embedded in HTML:
markdown, metadata = convert_with_metadata(html)
for sd in metadata.structured_data:
print(f" Type: {sd.data_type}") # "json_ld", "microdata", "rdfa"
print(f" Schema: {sd.schema_type}") # e.g., "Product", "Article"
print(f" Content: {sd.content}") # Raw content string
JSON-LD Example¶
For HTML containing:
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "My Article",
"author": {"@type": "Person", "name": "Jane Doe"}
}
</script>
The structured data result contains:
sd = metadata.structured_data[0]
sd.data_type # "json_ld"
sd.schema_type # "Article"
sd.content # The raw JSON string
Size limits
Structured data extraction is limited to max_structured_data_size bytes (default 100,000) to prevent memory exhaustion from very large JSON-LD blocks. Increase this limit if your documents contain large structured data payloads.
Combining Metadata with Conversion Options¶
You can pass ConversionOptions and MetadataConfig together in the same call:
from html_to_markdown import (
ConversionOptions,
MetadataConfig,
convert_with_metadata,
)
options = ConversionOptions(
heading_style="atx",
wrap=True,
wrap_width=80,
)
config = MetadataConfig(
extract_headers=True,
extract_links=True,
)
markdown, metadata = convert_with_metadata(
html,
options,
metadata_config=config,
)
use html_to_markdown_rs::{
convert_with_metadata, ConversionOptions, HeadingStyle, MetadataConfig,
};
let options = ConversionOptions {
heading_style: HeadingStyle::Atx,
wrap: true,
wrap_width: 80,
..Default::default()
};
let config = MetadataConfig {
extract_headers: true,
extract_links: true,
..Default::default()
};
let (markdown, metadata) = convert_with_metadata(html, Some(options), config, None)?;
Further Reading¶
- Metadata Extraction Concepts -- architecture and design details
- Configuration Options -- conversion options reference
- Visitor Pattern Guide -- combine visitors with metadata extraction