Skip to content

Metadata Extraction Guide v2.13.0

This guide shows how to extract structured metadata from HTML documents during conversion using the convert_with_metadata() API.


Overview

The metadata API returns both the converted Markdown and a structured metadata object in a single call. Metadata extraction happens during the same traversal pass as conversion, so there is minimal overhead.

For background on what metadata is extracted and how it works, see the Metadata Extraction concept page.


Basic Usage

from html_to_markdown import convert_with_metadata, MetadataConfig

metadata_config = MetadataConfig(
    extract_headers=True,
    extract_links=True,
    extract_images=True,
    extract_structured_data=True,
    max_structured_data_size=100000,
)
markdown, metadata = convert_with_metadata(html, metadata_config=metadata_config)
import { convertWithMetadata } from '@kreuzberg/html-to-markdown';

const result = convertWithMetadata('<h1>Title</h1><p>Content</p>');
const { markdown, metadata } = result;

console.log(markdown);           // Converted markdown
console.log(metadata.document);  // Document metadata (title, description, etc.)
console.log(metadata.headers);   // Header elements (h1-h6)
console.log(metadata.links);     // Extracted links
console.log(metadata.images);    // Extracted images
require 'html_to_markdown'

html = '<html lang="en"><head><title>Test</title></head><body><h1>Hello</h1></body></html>'
markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)

puts metadata[:document][:title]     # "Test"
puts metadata[:headers].first[:text] # "Hello"
use HtmlToMarkdown\Config\ConversionOptions;
use HtmlToMarkdown\Service\Converter;
use function HtmlToMarkdown\convert_with_metadata;

$html = '<html><head><title>Example</title></head><body><h1>Welcome</h1><a href="https://example.com">Link</a></body></html>';

// Object-oriented API
$converter = Converter::create();
$result = $converter->convertWithMetadata(
    $html,
    new ConversionOptions(headingStyle: 'Atx'),
    [
        'extract_headers' => true,
        'extract_links' => true,
        'extract_images' => true,
    ]
);

echo $result['markdown'];
echo $result['metadata']->document->title;
foreach ($result['metadata']->links as $link) {
    echo $link->href . ': ' . $link->text;
}

// Procedural API
$result = convert_with_metadata(
    $html,
    new ConversionOptions(headingStyle: 'Atx'),
    ['extract_headers' => true, 'extract_links' => true]
);
#include "html_to_markdown.h"
#include <stdio.h>

int main(void) {
    const char *html = "<html><head><title>Page</title></head>"
                       "<body><h1>Hello</h1></body></html>";

    char *result_json = html_to_markdown_convert_with_metadata_with_len(
        html, strlen(html), NULL, 0);
    if (result_json) {
        /* result_json is a JSON string with "markdown" and "metadata" fields */
        printf("%s\n", result_json);
        html_to_markdown_free_string(result_json);
    }
    return 0;
}
html = """
<html>
  <head><title>Example</title></head>
  <body>
    <h1 id="welcome">Welcome</h1>
    <a href="https://example.com">Example link</a>
  </body>
</html>
"""

{:ok, markdown, metadata} = HtmlToMarkdown.convert_with_metadata(html)

metadata["document"]["title"]        # "Example"
metadata["headers"] |> hd() |> Map.get("text") # "Welcome"
library(htmltomarkdown)

html <- '
<html>
  <head><title>Example</title></head>
  <body>
    <h1 id="welcome">Welcome</h1>
    <a href="https://example.com">Example link</a>
  </body>
</html>'

result <- convert_with_metadata(html)

cat(result$markdown)
result$metadata$document$title       # "Example"
result$metadata$headers[[1]]$text    # "Welcome"
use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};

let html = r#"
  <html lang="en">
    <head><title>Example</title></head>
    <body>
      <h1 id="welcome">Welcome</h1>
      <a href="https://example.com">Example link</a>
    </body>
  </html>
"#;

let config = MetadataConfig::default();
let (markdown, metadata) = convert_with_metadata(html, None, config, None)?;

println!("Title: {:?}", metadata.document.title);
println!("Headers: {:?}", metadata.headers);
println!("Links: {:?}", metadata.links);

Configuring Metadata Extraction

Control which categories of metadata are extracted using MetadataConfig:

from html_to_markdown import convert_with_metadata, MetadataConfig

# Extract only headers and links, skip everything else
config = MetadataConfig(
    extract_document=False,
    extract_headers=True,
    extract_links=True,
    extract_images=False,
    extract_structured_data=False,
)

markdown, metadata = convert_with_metadata(html, metadata_config=config)
import { convertWithMetadata } from '@kreuzberg/html-to-markdown';

const result = convertWithMetadata(html, {
  metadataConfig: {
    extractDocument: false,
    extractHeaders: true,
    extractLinks: true,
    extractImages: false,
    extractStructuredData: false,
  },
});
let config = MetadataConfig {
    extract_document: false,
    extract_headers: true,
    extract_links: true,
    extract_images: false,
    extract_structured_data: false,
    max_structured_data_size: 0,
};

let (markdown, metadata) = convert_with_metadata(html, None, config, None)?;
assert!(metadata.images.is_empty()); // Not extracted

Performance benefit

Disabling extraction categories you do not need skips the corresponding collection logic entirely. This is particularly beneficial when processing large documents with many links or images that you do not need to catalog.


Working with Document Metadata

Document-level metadata comes from <head> tags and the <html> element attributes:

markdown, metadata = convert_with_metadata(html)

doc = metadata.document  # or metadata["document"] depending on binding

# Basic fields
print(doc.title)        # From <title> tag
print(doc.description)  # From <meta name="description">
print(doc.author)       # From <meta name="author">
print(doc.language)     # From <html lang="...">
print(doc.charset)      # From <meta charset="...">

# Open Graph
print(doc.open_graph)   # Dict of og:* properties
# {"title": "...", "description": "...", "image": "...", "url": "..."}

# Twitter Card
print(doc.twitter_card) # Dict of twitter:* properties
# {"card": "summary_large_image", "site": "@handle"}

Working with Headers

Extracted headers preserve hierarchy and IDs for table-of-contents generation:

markdown, metadata = convert_with_metadata(html)

for header in metadata.headers:
    indent = "  " * (header.level - 1)
    anchor = f"#{header.id}" if header.id else ""
    print(f"{indent}H{header.level}: {header.text} {anchor}")

# Output:
# H1: Introduction #intro
#   H2: Background #background
#   H2: Methodology #methodology
#     H3: Data Collection #data-collection

Building a Table of Contents

def build_toc(headers):
    lines = []
    for h in headers:
        indent = "  " * (h.level - 1)
        if h.id:
            lines.append(f"{indent}- [{h.text}](#{h.id})")
        else:
            lines.append(f"{indent}- {h.text}")
    return "\n".join(lines)

toc = build_toc(metadata.headers)

Links are classified by type for easy filtering:

markdown, metadata = convert_with_metadata(html)

# Filter by type
external = [l for l in metadata.links if l.link_type == "external"]
internal = [l for l in metadata.links if l.link_type == "internal"]
anchors  = [l for l in metadata.links if l.link_type == "anchor"]
emails   = [l for l in metadata.links if l.link_type == "email"]

# Access link details
for link in external:
    print(f"  {link.text} -> {link.href}")
    print(f"  rel: {link.rel}")
Type Pattern Example
external Full URL with domain https://example.com/page
internal Relative path /about, ../contact
anchor Fragment-only #section, #top
email mailto: scheme mailto:user@example.com
phone tel: scheme tel:+1234567890

Working with Images

markdown, metadata = convert_with_metadata(html)

for img in metadata.images:
    print(f"  Source: {img.src}")
    print(f"  Alt: {img.alt}")
    print(f"  Type: {img.image_type}")  # "external", "data_uri", "inline"

Image Types

Type Description
external Standard URL (https://cdn.example.com/img.jpg)
data_uri Base64-encoded data URI (data:image/png;base64,...)
inline Inline SVG or other embedded content

Working with Structured Data

Structured data extraction captures machine-readable data embedded in HTML:

markdown, metadata = convert_with_metadata(html)

for sd in metadata.structured_data:
    print(f"  Type: {sd.data_type}")    # "json_ld", "microdata", "rdfa"
    print(f"  Schema: {sd.schema_type}")  # e.g., "Product", "Article"
    print(f"  Content: {sd.content}")   # Raw content string

JSON-LD Example

For HTML containing:

<script type="application/ld+json">
{
  "@context": "https://schema.org",
  "@type": "Article",
  "headline": "My Article",
  "author": {"@type": "Person", "name": "Jane Doe"}
}
</script>

The structured data result contains:

sd = metadata.structured_data[0]
sd.data_type    # "json_ld"
sd.schema_type  # "Article"
sd.content      # The raw JSON string

Size limits

Structured data extraction is limited to max_structured_data_size bytes (default 100,000) to prevent memory exhaustion from very large JSON-LD blocks. Increase this limit if your documents contain large structured data payloads.


Combining Metadata with Conversion Options

You can use both ConversionOptions and MetadataConfig together:

from html_to_markdown import (
    ConversionOptions,
    MetadataConfig,
    convert_with_metadata,
)

options = ConversionOptions(
    heading_style="atx",
    wrap=True,
    wrap_width=80,
)

config = MetadataConfig(
    extract_headers=True,
    extract_links=True,
)

markdown, metadata = convert_with_metadata(
    html,
    options,
    metadata_config=config,
)
use html_to_markdown_rs::{
    convert_with_metadata, ConversionOptions, HeadingStyle, MetadataConfig,
};

let options = ConversionOptions {
    heading_style: HeadingStyle::Atx,
    wrap: true,
    wrap_width: 80,
    ..Default::default()
};

let config = MetadataConfig {
    extract_headers: true,
    extract_links: true,
    ..Default::default()
};

let (markdown, metadata) = convert_with_metadata(html, Some(options), config, None)?;

Further Reading