Usage¶
Basic Conversion¶
convert() accepts an HTML string and returns a ConversionResult.
use html_to_markdown_rs::convert;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>";
let result = convert(html, None)?;
let markdown = result.content.unwrap_or_default();
println!("{markdown}");
Ok(())
}
from html_to_markdown import convert
html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
result = convert(html)
markdown = result.content
import { convert } from "@kreuzberg/html-to-markdown";
const result = convert("<h1>Hello World</h1>");
const markdown: string = result.content;
console.log(markdown); // # Hello World
package main
import (
"fmt"
"log"
htmltomarkdown "github.com/kreuzberg-dev/html-to-markdown/packages/go/v3"
)
func main() {
html := "<h1>Hello World</h1><p>This is a paragraph.</p>"
result, err := htmltomarkdown.Convert(html, nil)
if err != nil {
log.Fatal(err)
}
if result.Content != nil {
fmt.Println(*result.Content)
}
}
require 'html_to_markdown'
html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
result = HtmlToMarkdown.convert(html)
markdown = result[:content]
use HtmlToMarkdown\HtmlToMarkdown;
$result = HtmlToMarkdown::convert('<h1>Hello</h1><p>This is <strong>fast</strong>!</p>');
echo $result->content;
import dev.kreuzberg.htmltomarkdown.HtmlToMarkdown;
import dev.kreuzberg.htmltomarkdown.ConversionResult;
public class Example {
public static void main(String[] args) {
String html = "<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>";
ConversionResult result = HtmlToMarkdown.convert(html);
System.out.println(result.content());
}
}
using HtmlToMarkdown;
var html = "<h1>Hello World</h1><p>This is a paragraph.</p>";
var result = HtmlToMarkdownConverter.Convert(html);
Console.WriteLine(result.Content);
{:ok, result} = HtmlToMarkdown.convert("<h1>Hello</h1><p>This is <strong>fast</strong>!</p>")
IO.puts(result.content)
library(htmltomarkdown)
html <- "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
result <- convert(html)
markdown <- result$content
cat(markdown)
#include "html_to_markdown.h"
#include <stdio.h>
int main(void) {
HTMConversionResult *result = htm_convert("<h1>Hello</h1><p>World</p>", NULL);
if (result == NULL) {
fprintf(stderr, "convert failed (code %d): %s\n",
htm_last_error_code(), htm_last_error_context());
return 1;
}
char *content = htm_conversion_result_content(result);
if (content != NULL) {
printf("%s\n", content);
htm_free_string(content);
}
htm_conversion_result_free(result);
return 0;
}
import init, { convert } from "@kreuzberg/html-to-markdown-wasm";
await init();
const html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>";
const result = convert(html);
const markdown = result.content;
console.log(markdown);
import HtmlToMarkdown
let html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
let result = try convert(html, nil)
let markdown = result.content()?.toString() ?? ""
print(markdown)
import 'package:h2m/h2m.dart';
import 'package:h2m/src/html_to_markdown_rs_bridge_generated/frb_generated.dart'
show RustLib;
Future<void> main() async {
await RustLib.init();
const html = '<h1>Hello</h1><p>This is <strong>fast</strong>!</p>';
final result = await H2mBridge.convert(html);
print(result.content);
}
import dev.kreuzberg.android.HtmlToMarkdownRs
val html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
val result = HtmlToMarkdownRs.convert(html)
val markdown: String? = result.content
const std = @import("std");
const html_to_markdown = @import("html_to_markdown");
pub fn main() !void {
const html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>";
const result_json = try html_to_markdown.convert(html, null);
defer std.heap.c_allocator.free(result_json);
// result_json is the ConversionResult serialised as JSON; parse with
// std.json or read the `content` field directly.
std.debug.print("{s}\n", .{result_json});
}
ConversionResult Fields¶
Every call to convert() returns a ConversionResult with the following fields:
| Field | Type | Description |
|---|---|---|
content |
Option<String> |
The converted text (Markdown, Djot, or plain). None/null when output_format is "none". |
document |
Option<DocumentStructure> |
Structured document tree (headings, paragraphs, lists, tables). Only populated when include_document_structure is true. |
metadata |
HtmlMetadata |
Extracted HTML metadata (title, description, Open Graph, Twitter Card, JSON-LD, links, images). |
tables |
Vec<TableData> |
Extracted tables with full grid data (headers, rows, colspan/rowspan). |
images |
Vec<ExtractedImage> |
Extracted inline images (data URIs, embedded SVGs). Only populated when extract_images is true. |
warnings |
Vec<ProcessingWarning> |
Non-fatal warnings raised during conversion. |
Using Options¶
Control output style, metadata extraction, and more via ConversionOptions.
use html_to_markdown_rs::{ConversionOptions, HeadingStyle, convert};
fn main() -> Result<(), Box<dyn std::error::Error>> {
let options = ConversionOptions::builder()
.heading_style(HeadingStyle::Atx)
.skip_images(true)
.build();
let result = convert("<h1>Hello</h1><img src='pic.jpg'>", Some(options))?;
println!("{}", result.content.unwrap_or_default());
Ok(())
}
from html_to_markdown import ConversionOptions, convert
html = "<h1>Hello</h1><p>This is <strong>formatted</strong> content.</p>"
options = ConversionOptions(
heading_style="atx",
list_indent_width=2,
)
result = convert(html, options)
markdown = result.content
import { convert, ConversionOptions } from "@kreuzberg/html-to-markdown";
const options: ConversionOptions = {
headingStyle: "atx",
listIndentWidth: 2,
wrap: true,
};
const result = convert("<h1>Title</h1><p>Content</p>", options);
const markdown = result.content;
package main
import (
"fmt"
"log"
htmltomarkdown "github.com/kreuzberg-dev/html-to-markdown/packages/go/v3"
)
func main() {
html := "<h1>Hello</h1><p>Welcome</p>"
width := uint(80)
opts := htmltomarkdown.ConversionOptions{
Wrap: true,
WrapWidth: &width,
}
result, err := htmltomarkdown.Convert(html, &opts)
if err != nil {
log.Fatalf("Conversion failed: %v", err)
}
if result.Content != nil {
fmt.Println(*result.Content)
}
}
require 'html_to_markdown'
html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
result = HtmlToMarkdown.convert(html, heading_style: :atx, code_block_style: :fenced)
markdown = result[:content]
use HtmlToMarkdown\HtmlToMarkdown;
use HtmlToMarkdown\ConversionOptions;
$options = ConversionOptions::builder()
->headingStyle('atx')
->listIndentWidth(2)
->build();
$result = HtmlToMarkdown::convert('<h1>Hello</h1>', $options);
echo $result->content;
import dev.kreuzberg.htmltomarkdown.HtmlToMarkdown;
import dev.kreuzberg.htmltomarkdown.ConversionOptions;
import dev.kreuzberg.htmltomarkdown.ConversionResult;
public class MetadataExample {
public static void main(String[] args) {
String html = "<html><head><title>My Page</title></head>"
+ "<body><h1>Welcome</h1><a href=\"https://example.com\">Link</a></body></html>";
ConversionOptions options = ConversionOptions.builder()
.extractMetadata(true)
.build();
ConversionResult result = HtmlToMarkdown.convert(html, options);
System.out.println("Markdown: " + result.content());
System.out.println("Title: " + result.metadata().document().title());
System.out.println("Headers: " + result.metadata().headers().size());
System.out.println("Links: " + result.metadata().links().size());
}
}
using HtmlToMarkdown;
var options = new ConversionOptions
{
HeadingStyle = "atx",
Wrap = true,
WrapWidth = 80,
ListIndentWidth = 4,
};
var html = "<h1>Hello</h1><p>This is <strong>formatted</strong> content.</p>";
var result = HtmlToMarkdownConverter.Convert(html, options);
Console.WriteLine(result.Content);
opts = %HtmlToMarkdown.Options{wrap: true, wrap_width: 40}
{:ok, result} = HtmlToMarkdown.convert("<h1>Hello</h1><p>World</p>", opts)
IO.puts(result.content)
library(htmltomarkdown)
opts <- conversion_options(
heading_style = "atx",
wrap = TRUE,
wrap_width = 80L
)
result <- convert("<h1>Hello</h1><p>World</p>", opts)
cat(result$content)
import HtmlToMarkdown
let options = try conversionOptionsFromJson(
"{\"heading_style\":\"atx\",\"list_indent_width\":2,\"wrap\":true}"
)
let html = "<h1>Hello</h1><p>This is <strong>formatted</strong> content.</p>"
let result = try convert(html, options)
let markdown = result.content()?.toString() ?? ""
print(markdown)
import 'package:h2m/h2m.dart';
import 'package:h2m/src/html_to_markdown_rs_bridge_generated/frb_generated.dart'
show RustLib;
Future<void> main() async {
await RustLib.init();
final options = await createConversionOptionsFromJson(
json: '{"heading_style":"atx","list_indent_width":2,"wrap":true}',
);
const html = '<h1>Hello</h1><p>This is <strong>formatted</strong> content.</p>';
final result = await H2mBridge.convert(html, options: options);
print(result.content);
}
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.databind.PropertyNamingStrategies
import com.fasterxml.jackson.module.kotlin.registerKotlinModule
import dev.kreuzberg.android.ConversionOptions
import dev.kreuzberg.android.HtmlToMarkdownRs
val mapper = ObjectMapper()
.registerKotlinModule()
.setPropertyNamingStrategy(PropertyNamingStrategies.SNAKE_CASE)
val options = mapper.readValue(
"{\"heading_style\":\"Atx\",\"list_indent_width\":2,\"wrap\":true}",
ConversionOptions::class.java,
)
val html = "<h1>Hello</h1><p>This is <strong>formatted</strong> content.</p>"
val result = HtmlToMarkdownRs.convert(html, options)
val markdown: String? = result.content
const std = @import("std");
const html_to_markdown = @import("html_to_markdown");
pub fn main() !void {
const html = "<h1>Hello</h1><p>This is <strong>formatted</strong> content.</p>";
const options_json =
\\{"heading_style":"atx","list_indent_width":2,"wrap":true}
;
const result_json = try html_to_markdown.convert(html, options_json);
defer std.heap.c_allocator.free(result_json);
std.debug.print("{s}\n", .{result_json});
}
Metadata Extraction¶
The extract_metadata option (enabled by default) populates the metadata field with structured data parsed from the HTML <head> and document body. Some examples below set it explicitly for clarity.
use html_to_markdown_rs::{convert, ConversionOptions};
fn main() -> Result<(), Box<dyn std::error::Error>> {
let html = r#"<html><head><title>My Page</title></head>
<body><h1>Hello</h1><a href="https://example.com">Link</a></body></html>"#;
let options = ConversionOptions::builder()
.extract_metadata(true)
.build();
let result = convert(html, Some(options))?;
let markdown = result.content.unwrap_or_default();
println!("Markdown: {}", markdown);
println!("Title: {:?}", result.metadata.as_ref().and_then(|m| m.title.as_deref()));
println!("Links: {:?}", result.metadata.as_ref().map(|m| &m.links));
Ok(())
}
from html_to_markdown import ConversionOptions, convert
options = ConversionOptions(
extract_metadata=True,
extract_images=True,
)
result = convert(html, options)
markdown = result.content
metadata = result.metadata
import { convert, ConversionOptions } from "@kreuzberg/html-to-markdown";
const options: ConversionOptions = { extractMetadata: true };
const result = convert("<h1>Title</h1><p>Content</p>", options);
console.log(result.content); // Converted markdown
console.log(result.metadata?.document); // Document metadata (title, description, etc.)
console.log(result.metadata?.headers); // Header elements (h1-h6)
console.log(result.metadata?.links); // Extracted links
console.log(result.metadata?.images); // Extracted images
package main
import (
"fmt"
"log"
htmltomarkdown "github.com/kreuzberg-dev/html-to-markdown/packages/go/v3"
)
func main() {
html := `<html><head><title>My Page</title></head>
<body><h1>Hello</h1><a href="https://example.com">Link</a></body></html>`
// extract_metadata defaults to true; nil options is enough.
result, err := htmltomarkdown.Convert(html, nil)
if err != nil {
log.Fatal(err)
}
if result.Content != nil {
fmt.Println("Markdown:", *result.Content)
}
if result.Metadata.Document.Title != nil {
fmt.Println("Title:", *result.Metadata.Document.Title)
}
for _, link := range result.Metadata.Links {
fmt.Printf("Link: %s (%s)\n", link.Href, link.Text)
}
}
require 'html_to_markdown'
html = '<html lang="en"><head><title>Test</title></head><body><h1>Hello</h1></body></html>'
result = HtmlToMarkdown.convert(html, extract_metadata: true)
markdown = result[:content]
puts result[:metadata][:document][:title] # "Test"
puts result[:metadata][:headers].first[:text] # "Hello"
use HtmlToMarkdown\HtmlToMarkdown;
$html = '<html><head><title>Example</title></head><body><h1>Welcome</h1><a href="https://example.com">Link</a></body></html>';
// extract_metadata defaults to true.
$result = HtmlToMarkdown::convert($html);
echo $result->content;
echo $result->metadata->document->title;
foreach ($result->metadata->links as $link) {
echo $link->href . ': ' . $link->text;
}
import dev.kreuzberg.htmltomarkdown.HtmlToMarkdown;
import dev.kreuzberg.htmltomarkdown.ConversionOptions;
import dev.kreuzberg.htmltomarkdown.ConversionResult;
public class MetadataExample {
public static void main(String[] args) {
String html = """
<html><head><title>My Page</title></head>
<body><h1>Hello</h1><a href="https://example.com">Link</a></body></html>
""";
ConversionOptions options = ConversionOptions.builder()
.extractMetadata(true)
.build();
ConversionResult result = HtmlToMarkdown.convert(html, options);
System.out.println("Markdown: " + result.content());
System.out.println("Title: " + result.metadata().getTitle());
System.out.println("Links: " + result.metadata().getLinks());
}
}
using HtmlToMarkdown;
var html = @"<html><head><title>My Page</title></head>
<body><h1>Hello</h1><a href=""https://example.com"">Link</a></body></html>";
var options = new ConversionOptions { ExtractMetadata = true };
var result = HtmlToMarkdownConverter.Convert(html, options);
Console.WriteLine($"Markdown: {result.Content}");
Console.WriteLine($"Title: {result.Metadata?.Title}");
Console.WriteLine($"Links: {string.Join(", ", result.Metadata?.Links ?? [])}");
Metadata Extraction - Elixir¶
Extract structured metadata from HTML documents during conversion.
Basic Metadata Extraction¶
Use convert/2 with extract_metadata: true in options to extract document metadata alongside Markdown:
html = """
<html>
<head>
<title>Example</title>
<meta name="description" content="Demo page">
</head>
<body>
<h1 id="welcome">Welcome</h1>
<a href="https://example.com" rel="nofollow external">Example link</a>
</body>
</html>
"""
opts = %HtmlToMarkdown.Options{extract_metadata: true}
{:ok, result} = HtmlToMarkdown.convert(html, opts)
result.metadata["document"]["title"] # "Example"
result.metadata["headers"] |> hd() |> Map.get("text") # "Welcome"
result.metadata["links"] |> hd() |> Map.get("link_type") # "external"
Extracted Metadata Structure¶
The metadata map includes:
- Document: Title and meta tags from <head>
- Headers: All headings extracted with level, text, and optional ID
- Links: All links with href, text, rel attributes, and link_type classification
- Images: Image sources and alt text
- Forms: Form action and method data
- Other: Tables, code blocks, and additional structural information
library(htmltomarkdown)
html <- '
<html>
<head><title>Example</title></head>
<body>
<h1 id="welcome">Welcome</h1>
<a href="https://example.com">Example link</a>
</body>
</html>'
opts <- conversion_options(extract_metadata = TRUE)
result <- convert(html, opts)
cat(result$content)
result$metadata$document$title
result$metadata$headers[[1]]$text
result$metadata$links[[1]]$link_type
import HtmlToMarkdown
let options = try conversionOptionsFromJson(
"{\"extract_metadata\":true,\"extract_images\":true}"
)
let html = """
<html>
<head>
<title>My Page</title>
<meta name="description" content="A short description.">
<meta name="author" content="Jane Doe">
</head>
<body>
<h1>Welcome</h1>
<p>See <a href="https://example.com">example</a>.</p>
</body>
</html>
"""
let result = try convert(html, options)
let markdown = result.content()?.toString() ?? ""
let metadata = result.metadata()
let document = metadata.document()
print("title:", document.title()?.toString() ?? "")
print("description:", document.description()?.toString() ?? "")
print("author:", document.author()?.toString() ?? "")
print("headers:", metadata.headers().count)
print("links:", metadata.links().count)
print("images:", metadata.images().count)
print(markdown)
import 'package:h2m/h2m.dart';
import 'package:h2m/src/html_to_markdown_rs_bridge_generated/frb_generated.dart'
show RustLib;
Future<void> main() async {
await RustLib.init();
final options = await createConversionOptionsFromJson(
json: '{"extract_metadata":true}',
);
final result = await H2mBridge.convert(
'<html><head><title>Example</title>'
'<meta name="description" content="A sample page">'
'</head><body><h1>Hello</h1><a href="https://example.com">link</a></body></html>',
options: options,
);
print(result.content);
print(result.metadata.document.title); // "Example"
print(result.metadata.document.description); // "A sample page"
print(result.metadata.headers); // List<HeaderMetadata>
print(result.metadata.links); // List<LinkMetadata>
print(result.metadata.images); // List<ImageMetadata>
}
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.databind.PropertyNamingStrategies
import com.fasterxml.jackson.module.kotlin.registerKotlinModule
import dev.kreuzberg.android.ConversionOptions
import dev.kreuzberg.android.HtmlToMarkdownRs
val mapper = ObjectMapper()
.registerKotlinModule()
.setPropertyNamingStrategy(PropertyNamingStrategies.SNAKE_CASE)
val options = mapper.readValue(
"{\"extract_metadata\":true}",
ConversionOptions::class.java,
)
val html = """
<html>
<head>
<title>My Page</title>
<meta name="description" content="A short description">
</head>
<body><h1>Hello</h1><a href="https://example.com">Link</a></body>
</html>
""".trimIndent()
val result = HtmlToMarkdownRs.convert(html, options)
println("Markdown: ${result.content}")
println("Title: ${result.metadata.document.title}")
println("Description: ${result.metadata.document.description}")
println("Headers: ${result.metadata.headers}")
println("Links: ${result.metadata.links}")
println("Images: ${result.metadata.images}")
const std = @import("std");
const html_to_markdown = @import("html_to_markdown");
pub fn main() !void {
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
defer _ = gpa.deinit();
const allocator = gpa.allocator();
const html =
\\<html><head>
\\<title>Example Page</title>
\\<meta name="description" content="A short description.">
\\<meta name="author" content="Jane Doe">
\\<link rel="canonical" href="https://example.com/page">
\\</head><body><h1>Hello</h1></body></html>
;
// `convert` returns the ConversionResult as a JSON string. Metadata
// extraction is controlled by the `extract_metadata` option (default true).
const result_json = try html_to_markdown.convert(html, "{\"extract_metadata\":true}");
defer std.heap.c_allocator.free(result_json);
var parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
defer parsed.deinit();
const document = parsed.value.object.get("metadata").?.object.get("document").?.object;
if (document.get("title")) |v| {
if (v == .string) std.debug.print("Title: {s}\n", .{v.string});
}
if (document.get("description")) |v| {
if (v == .string) std.debug.print("Description: {s}\n", .{v.string});
}
if (document.get("author")) |v| {
if (v == .string) std.debug.print("Author: {s}\n", .{v.string});
}
if (document.get("canonical_url")) |v| {
if (v == .string) std.debug.print("Canonical: {s}\n", .{v.string});
}
}
Metadata Fields¶
result.metadata is an HtmlMetadata with five top-level fields: document, headers, links, images, and structured_data. Everything is populated in a single pass.
document (DocumentMetadata)¶
| Field | Type | Description |
|---|---|---|
title |
Option<String> |
Page title from the <title> element. |
description |
Option<String> |
<meta name="description"> content. |
keywords |
Vec<String> |
Parsed <meta name="keywords">, split on commas. |
author |
Option<String> |
<meta name="author"> content. |
canonical_url |
Option<String> |
<link rel="canonical"> href. |
base_href |
Option<String> |
<base href="../…"> value. |
language |
Option<String> |
lang attribute on <html>. |
text_direction |
Option<TextDirection> |
dir attribute on <html>. One of left_to_right, right_to_left, auto. |
open_graph |
BTreeMap<String, String> |
All og:* meta tags keyed by property (without the og: prefix). |
twitter_card |
BTreeMap<String, String> |
All twitter:* meta tags keyed by name (without the prefix). |
meta_tags |
BTreeMap<String, String> |
Every other <meta name> tag, keyed by name. |
headers, links, images, structured_data¶
| Field | Description |
|---|---|
headers |
HeaderMetadata entries for every <h1>–<h6> with level, text, and id. |
links |
LinkMetadata entries for every <a> with href, text, rel values, and classified link_type. |
images |
ImageMetadata entries for every <img> with src, alt, dimensions, and classified image_type. |
structured_data |
JSON-LD, Microdata, and RDFa blocks with a data_type tag and the raw content. |
links[].link_type¶
| Value | Matches |
|---|---|
anchor |
href starts with # (same-page anchors). |
internal |
relative href or href that resolves inside the document's own host. |
external |
absolute URL on a different host. |
email |
mailto: URI. |
phone |
tel: URI. |
other |
anything else (javascript:, data:, custom schemes). |
images[].image_type¶
| Value | Matches |
|---|---|
data_uri |
src starts with data:. |
inline_svg |
inline <svg> element (captured when extract_images is enabled). |
external |
absolute URL on a remote host. |
relative |
relative path or same-host URL. |
structured_data[].data_type¶
| Value | Matches |
|---|---|
json_ld |
<script type="application/ld+json"> blocks. |
microdata |
itemscope/itemprop subtrees. |
rdfa |
typeof/property subtrees. |
Document Structure Extraction¶
Enable include_document_structure to get a parsed tree of the document's structural elements.
=== "Rust"
```rust
use html_to_markdown_rs::{convert, ConversionOptions};
let options = ConversionOptions::builder()
.include_document_structure(true)
.build();
let result = convert("<h1>Title</h1><p>Paragraph</p>", Some(options))?;
if let Some(doc) = &result.document {
for node in &doc.nodes {
println!("{:?}", node);
}
}
```
=== "Python"
```python
from html_to_markdown import ConversionOptions, convert
options = ConversionOptions(include_document_structure=True)
result = convert("<h1>Title</h1><p>Paragraph</p>", options)
doc = result.document
for node in doc.nodes:
print(node)
```
=== "TypeScript"
```typescript
import { convert, ConversionOptions } from '@kreuzberg/html-to-markdown';
const options: ConversionOptions = { includeDocumentStructure: true };
const result = convert('<h1>Title</h1><p>Paragraph</p>', options);
const nodes = result.document?.nodes ?? [];
for (const node of nodes) {
console.log(node);
}
```
---
!!! question "Found a bug or mistake on this page?"
If something here is wrong or out of date, [open an issue](https://github.com/kreuzberg-dev/html-to-markdown/issues/new?labels=documentation) on GitHub or [contribute a fix](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/CONTRIBUTING.md) via pull request.