Usage¶
Basic Conversion¶
convert() accepts an HTML string and returns a ConversionResult.
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown"
)
func main() {
html := "<h1>Hello World</h1><p>This is a paragraph.</p>"
result, err := htmltomarkdown.Convert(html)
if err != nil {
log.Fatal(err)
}
if result.Content != nil {
fmt.Println(*result.Content)
}
}
use HtmlToMarkdown\Service\Converter;
use function HtmlToMarkdown\convert;
// Object-oriented usage
$converter = Converter::create();
$result = $converter->convert('<h1>Hello</h1><p>This is <strong>fast</strong>!</p>');
$markdown = $result['content'];
// Procedural helper
$result = convert('<h1>Hello</h1>');
$markdown = $result['content'];
import dev.kreuzberg.htmltomarkdown.HtmlToMarkdown;
import dev.kreuzberg.htmltomarkdown.ConversionResult;
public class Example {
public static void main(String[] args) {
String html = "<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>";
ConversionResult result = HtmlToMarkdown.convert(html);
System.out.println(result.content());
}
}
#include "html_to_markdown.h"
#include <stdio.h>
int main(void) {
const char *html = "<h1>Hello</h1><p>World</p>";
/* Returns JSON: {"content":"...","metadata":null,"tables":null} */
char *json = html_to_markdown_convert(html, NULL);
if (json) {
/* Parse JSON to extract content field */
printf("%s\n", json);
html_to_markdown_free_string(json);
}
return 0;
}
ConversionResult Fields¶
Every call to convert() returns a ConversionResult with the following fields:
| Field | Type | Description |
|---|---|---|
| `content` | `Optional<String>` | The converted text (Markdown, Djot, or plain). None/null when `output_format` is `"none"`. |
| `document` | `Optional<DocumentStructure>` | Structured document tree (headings, paragraphs, lists, tables). Only populated when `include_document_structure` is true. |
| `metadata` | `HtmlMetadata` | Extracted HTML metadata (title, description, Open Graph, Twitter Card, JSON-LD, links, images). |
| `tables` | `Vec<TableData>` | Extracted tables with full grid data (headers, rows, colspan/rowspan). |
| `images` | `Vec<ExtractedImage>` | Extracted inline images (data URIs, embedded SVGs). Only populated when `extract_images` is true. |
| `warnings` | `Vec<ProcessingWarning>` | Non-fatal warnings raised during conversion. |
Using Options¶
Control output style, metadata extraction, and more via ConversionOptions.
use html_to_markdown_rs::{convert, ConversionOptions, HeadingStyle};
fn main() -> Result<(), Box<dyn std::error::Error>> {
let options = ConversionOptions::builder()
.heading_style(HeadingStyle::Atx)
.skip_images(true)
.build();
let result = convert("<h1>Hello</h1><img src='pic.jpg'>", Some(options))?;
let markdown = result.content.unwrap_or_default();
println!("{markdown}");
Ok(())
}
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown"
)
func main() {
// Check library version
version := htmltomarkdown.Version()
fmt.Printf("html-to-markdown version: %s\n", version)
html := "<h1>Hello</h1><p>Welcome</p>"
// Convert with error handling
result, err := htmltomarkdown.Convert(html)
if err != nil {
log.Fatalf("Conversion failed: %v", err)
}
if result.Content != nil {
fmt.Println(*result.Content)
}
}
import dev.kreuzberg.htmltomarkdown.HtmlToMarkdown;
import dev.kreuzberg.htmltomarkdown.ConversionOptions;
import dev.kreuzberg.htmltomarkdown.ConversionResult;
public class MetadataExample {
public static void main(String[] args) {
String html = "<html><head><title>My Page</title></head>"
+ "<body><h1>Welcome</h1><a href=\"https://example.com\">Link</a></body></html>";
ConversionOptions options = ConversionOptions.builder()
.extractMetadata(true)
.build();
ConversionResult result = HtmlToMarkdown.convert(html, options);
System.out.println("Markdown: " + result.content());
System.out.println("Title: " + result.metadata().document().title());
System.out.println("Headers: " + result.metadata().headers().size());
System.out.println("Links: " + result.metadata().links().size());
}
}
using HtmlToMarkdown;
var options = new ConversionOptions
{
HeadingStyle = "atx",
Wrap = true,
WrapWidth = 80,
ListIndentWidth = 4,
};
var html = "<h1>Hello</h1><p>This is <strong>formatted</strong> content.</p>";
var result = HtmlToMarkdownConverter.Convert(html, options);
Console.WriteLine(result.Content);
Metadata Extraction¶
Enable extract_metadata to populate the metadata field with structured data parsed from the HTML <head> and document body.
use html_to_markdown_rs::{convert, ConversionOptions};
fn main() -> Result<(), Box<dyn std::error::Error>> {
let html = r#"<html><head><title>My Page</title></head>
<body><h1>Hello</h1><a href="https://example.com">Link</a></body></html>"#;
let options = ConversionOptions::builder()
.extract_metadata(true)
.build();
let result = convert(html, Some(options))?;
let markdown = result.content.unwrap_or_default();
println!("Markdown: {}", markdown);
println!("Title: {:?}", result.metadata.as_ref().and_then(|m| m.title.as_deref()));
println!("Links: {:?}", result.metadata.as_ref().map(|m| &m.links));
Ok(())
}
from html_to_markdown import ConversionOptions, convert
options = ConversionOptions(
extract_metadata=True,
extract_headers=True,
extract_links=True,
extract_images=True,
extract_structured_data=True,
max_structured_data_size=100000,
)
html = "<html><head><title>My Page</title></head><body><h1>Hello</h1></body></html>"
result = convert(html, options)
markdown = result["content"]
metadata = result["metadata"]
import { convert, ConversionOptions } from '@kreuzberg/html-to-markdown';
const options: ConversionOptions = { extractMetadata: true };
const result = convert('<h1>Title</h1><p>Content</p>', options);
console.log(result.content); // Converted markdown
console.log(result.metadata?.document); // Document metadata (title, description, etc.)
console.log(result.metadata?.headers); // Header elements (h1-h6)
console.log(result.metadata?.links); // Extracted links
console.log(result.metadata?.images); // Extracted images
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown"
)
func main() {
html := `<html><head><title>My Page</title></head>
<body><h1>Hello</h1><a href="https://example.com">Link</a></body></html>`
opts := htmltomarkdown.ConversionOptions{ExtractMetadata: true}
result, err := htmltomarkdown.Convert(html, opts)
if err != nil {
log.Fatal(err)
}
if result.Content != nil {
fmt.Println("Markdown:", *result.Content)
}
if result.Metadata != nil {
fmt.Println("Title:", result.Metadata.Title)
fmt.Println("Links:", result.Metadata.Links)
}
}
require 'html_to_markdown'
html = '<html lang="en"><head><title>Test</title></head><body><h1>Hello</h1></body></html>'
result = HtmlToMarkdown.convert(html, extract_metadata: true)
markdown = result[:content]
puts result[:metadata][:document][:title] # "Test"
puts result[:metadata][:headers].first[:text] # "Hello"
use HtmlToMarkdown\Config\ConversionOptions;
use HtmlToMarkdown\Service\Converter;
$html = '<html><head><title>Example</title></head><body><h1>Welcome</h1><a href="https://example.com">Link</a></body></html>';
$converter = Converter::create();
$result = $converter->convert(
$html,
new ConversionOptions(
headingStyle: 'Atx',
extractMetadata: true,
extractHeaders: true,
extractLinks: true,
extractImages: true,
)
);
echo $result['content'];
echo $result['metadata']->document->title;
foreach ($result['metadata']->links as $link) {
echo $link->href . ': ' . $link->text;
}
import dev.kreuzberg.htmltomarkdown.HtmlToMarkdown;
import dev.kreuzberg.htmltomarkdown.ConversionOptions;
import dev.kreuzberg.htmltomarkdown.ConversionResult;
public class MetadataExample {
public static void main(String[] args) {
String html = """
<html><head><title>My Page</title></head>
<body><h1>Hello</h1><a href="https://example.com">Link</a></body></html>
""";
ConversionOptions options = ConversionOptions.builder()
.extractMetadata(true)
.build();
ConversionResult result = HtmlToMarkdown.convert(html, options);
System.out.println("Markdown: " + result.content());
System.out.println("Title: " + result.metadata().document().title());
System.out.println("Links: " + result.metadata().links());
}
}
using HtmlToMarkdown;
var html = @"<html><head><title>My Page</title></head>
<body><h1>Hello</h1><a href=""https://example.com"">Link</a></body></html>";
var options = new ConversionOptions { ExtractMetadata = true };
var result = HtmlToMarkdownConverter.Convert(html, options);
Console.WriteLine($"Markdown: {result.Content}");
Console.WriteLine($"Title: {result.Metadata?.Title}");
Console.WriteLine($"Links: {string.Join(", ", result.Metadata?.Links ?? [])}");
Metadata Extraction - Elixir¶
Extract structured metadata from HTML documents during conversion.
Basic Metadata Extraction¶
Use convert/2 with extract_metadata: true in options to extract document metadata alongside Markdown:
html = """
<html>
<head>
<title>Example</title>
<meta name="description" content="Demo page">
</head>
<body>
<h1 id="welcome">Welcome</h1>
<a href="https://example.com" rel="nofollow external">Example link</a>
</body>
</html>
"""
opts = %HtmlToMarkdown.Options{extract_metadata: true}
{:ok, result} = HtmlToMarkdown.convert(html, opts)
result.metadata["document"]["title"] # "Example"
result.metadata["headers"] |> hd() |> Map.get("text") # "Welcome"
result.metadata["links"] |> hd() |> Map.get("link_type") # "external"
Extracted Metadata Structure¶
The metadata map includes:
- Document: Title and meta tags from `<head>`
- Headers: All headings extracted with level, text, and optional ID
- Links: All links with href, text, rel attributes, and link_type classification
- Images: Image sources and alt text
- Forms: Form action and method data
- Other: Tables, code blocks, and additional structural information
library(htmltomarkdown)
html <- '
<html>
<head><title>Example</title></head>
<body>
<h1 id="welcome">Welcome</h1>
<a href="https://example.com">Example link</a>
</body>
</html>'
opts <- conversion_options(extract_metadata = TRUE)
result <- convert(html, opts)
cat(result$content)
result$metadata$document$title
result$metadata$headers[[1]]$text
result$metadata$links[[1]]$link_type
Metadata Fields¶
| Field | Description |
|---|---|
| `document.title` | Page title from `<title>` tag |
| `document.description` | Content of `<meta name="description">` |
| `document.language` | `lang` attribute of `<html>` tag |
| `document.charset` | Character encoding declaration |
| `document.open_graph` | Open Graph tags (`og:title`, `og:description`, etc.) |
| `document.twitter_card` | Twitter Card tags |
| `document.json_ld` | JSON-LD structured data blocks |
| `headers` | All `<h1>`–`<h6>` elements with level, text, and id |
| `links` | All `<a>` tags with href, text, rel, and link type |
| `images` | All `<img>` tags with src, alt, width, height |
Document Structure Extraction¶
Enable include_document_structure to get a parsed tree of the document's structural elements.
use html_to_markdown_rs::{convert, ConversionOptions};
let options = ConversionOptions::builder()
.include_document_structure(true)
.build();
let result = convert("<h1>Title</h1><p>Paragraph</p>", Some(options))?;
if let Some(doc) = &result.document {
for node in &doc.nodes {
println!("{:?}", node);
}
}
import { convert, ConversionOptions } from '@kreuzberg/html-to-markdown';
const options: ConversionOptions = { includeDocumentStructure: true };
const result = convert('<h1>Title</h1><p>Paragraph</p>', options);
const nodes = result.document?.nodes ?? [];
for (const node of nodes) {
console.log(node);
}