Skip to content

Visitor Pattern Guide v2.23.0

This guide shows how to create custom visitors that modify HTML-to-Markdown conversion behavior. Visitors let you intercept specific HTML elements and control their output without modifying the library.

For the conceptual overview, see Visitor Pattern Concepts.


Basic Visitor Example

The simplest visitor implements one or more visit_* methods. Each method receives context about the current element and returns a result indicating how to handle it.

from html_to_markdown import convert_with_visitor

class CustomVisitor:
    """Minimal visitor that hooks links and images but keeps the default output."""

    def visit_link(self, ctx, href, text, title):
        """Handle a link; currently defers to the default conversion."""
        # Custom link handling
        result = {"type": "continue"}
        return result

    def visit_image(self, ctx, src, alt, title):
        """Handle an image; currently defers to the default conversion."""
        # Custom image handling
        result = {"type": "continue"}
        return result

markdown = convert_with_visitor(html, visitor=CustomVisitor())
import { convertWithVisitor } from '@kreuzberg/html-to-markdown';
import { Visitor, NodeContext, VisitResult } from '@kreuzberg/html-to-markdown';

// Visitor that renders links itself and leaves headings to the converter.
const visitor: Visitor = {
  visitLink(ctx: NodeContext, href: string, text: string): VisitResult {
    // Custom handling for links: emit a plain inline Markdown link.
    const output = `[${text}](${href})`;
    return { type: 'custom', output };
  },
  visitHeading(ctx: NodeContext, level: number, text: string): VisitResult {
    // Custom handling for headings: fall through to the default conversion.
    const result: VisitResult = { type: 'continue' };
    return result;
  },
};

const markdown = convertWithVisitor('<h1>Title</h1><a href="url">Link</a>', {
  visitor,
});
require 'html_to_markdown'

# Visitor that rewrites links as inline Markdown and drops images.
class MyVisitor
  # Render the link as "[text](href)"; the title is accepted but not used.
  def visit_link(ctx, href, text, title = nil)
    markdown_link = "[#{text}](#{href})"
    { type: :custom, output: markdown_link }
  end

  # Remove images
  def visit_image(ctx, src, alt, title = nil)
    result = { type: :skip }
    result
  end
end

html = "<p><a href='https://example.com'>Link</a></p>"
result = HtmlToMarkdown.convert_with_visitor(html, visitor: MyVisitor.new)
use HtmlToMarkdown\Visitor\AbstractVisitor;
use HtmlToMarkdown\Visitor\NodeContext;
use HtmlToMarkdown\Visitor\VisitResult;
use HtmlToMarkdown\Service\Converter;

/**
 * Visitor that drops images and emits links as inline Markdown.
 */
class CustomVisitor extends AbstractVisitor
{
    /**
     * Skip all images.
     */
    public function visitImage(NodeContext $context, string $src, string $alt, ?string $title): array
    {
        $result = VisitResult::skip();

        return $result;
    }

    /**
     * Custom link handling: render the link as "[text](href)".
     */
    public function visitLink(NodeContext $context, string $href, string $text, ?string $title): array
    {
        $markdown = sprintf('[%s](%s)', $text, $href);

        return VisitResult::custom($markdown);
    }
}

$converter = Converter::create();
$markdown = $converter->convertWithVisitor(
    '<a href="/page">Link</a><img src="pic.png" alt="pic">',
    null,
    new CustomVisitor()
);
#include "html_to_markdown.h"
#include <stdio.h>
#include <string.h>

/*
 * Heading visitor callback that defers to the default conversion.
 *
 * Neither argument is inspected: the callback only tells the converter to
 * proceed as usual, so both parameters are explicitly marked unused.
 */
static HtmlToMarkdownVisitResult visit_heading(
    const HtmlToMarkdownVisitHeadingData *data, void *user_data) {
    (void)data;      /* not inspected; avoids an unused-parameter warning */
    (void)user_data;
    HtmlToMarkdownVisitResult result = {0};
    /* Use default conversion for all headings */
    result.type = Continue;
    return result;
}

/*
 * Convert a small HTML document with a heading visitor installed and print
 * the resulting Markdown.
 *
 * Returns 0 when the conversion produced output and 1 when it returned NULL,
 * so callers (shells, CI) can detect a failed conversion.
 */
int main(void) {
    const char *html = "<h1>Title</h1><p>Content</p>";
    int status = 1; /* assume failure until Markdown is actually produced */

    /* Zero-initialise so every callback we do not set stays disabled. */
    html_to_markdown_visitor_callbacks_t callbacks = {0};
    callbacks.visit_heading = (struct Option_HtmlToMarkdownVisitHeadingCallback){
        .is_some = true,
        .value = visit_heading,
    };

    HtmlToMarkdownVisitor *visitor = html_to_markdown_visitor_new(&callbacks);
    char *markdown = html_to_markdown_convert_with_visitor(html, visitor);
    if (markdown) {
        printf("%s\n", markdown);
        html_to_markdown_free_string(markdown);
        status = 0;
    } else {
        fprintf(stderr, "HTML-to-Markdown conversion failed\n");
    }
    html_to_markdown_visitor_free(visitor);
    return status;
}
defmodule MyVisitor do
  # Visitor that flattens links to their text and drops images.
  use HtmlToMarkdown.Visitor

  # Replace each link with its text content.
  @impl true
  def handle_link(_context, _href, text, _title), do: {:custom, text}

  # Remove images from the output.
  @impl true
  def handle_image(_context, _src, _alt, _title), do: :skip
end

html = "<p><a href='https://example.com'>Link</a> <img src='pic.png'></p>"
{:ok, markdown} = HtmlToMarkdown.Visitor.convert_with_visitor(html, MyVisitor, nil)
library(htmltomarkdown)

html <- "<p>Visit <a href='https://example.com'>our site</a></p>"
markdown <- convert_with_visitor(html)
cat(markdown)
use html_to_markdown_rs::convert_with_visitor;
use html_to_markdown_rs::visitor::{HtmlVisitor, NodeContext, VisitResult};

#[derive(Debug)]
struct StripLinksVisitor;

impl HtmlVisitor for StripLinksVisitor {
    /// Convert links to plain text by emitting only the link's text.
    fn visit_link(
        &mut self,
        _ctx: &NodeContext,
        _href: &str,
        text: &str,
        _title: Option<&str>,
    ) -> VisitResult {
        let plain_text = String::from(text);
        VisitResult::Custom(plain_text)
    }

    /// Drop every image from the output.
    fn visit_image(
        &mut self,
        _ctx: &NodeContext,
        _src: &str,
        _alt: &str,
        _title: Option<&str>,
    ) -> VisitResult {
        VisitResult::Skip
    }
}

Return Values

Every visitor callback must return one of these result types:

Continue

Use the default conversion for this element. This is the default when a callback is not implemented.

def visit_link(self, ctx, href, text, title):
    """Defer to the default conversion for this link."""
    result = {"type": "continue"}
    return result

Custom

Replace the element's output with your own Markdown string:

def visit_link(self, ctx, href, text, title):
    """Replace the link's output with an inline Markdown link."""
    markdown = f"[{text}]({href})"
    return {"type": "custom", "output": markdown}

Skip

Remove the element entirely from the output:

def visit_image(self, ctx, src, alt, title):
    """Remove the image from the output."""
    result = {"type": "skip"}
    return result

Preserve HTML

Keep the raw HTML in the Markdown output:

def visit_link(self, ctx, href, text, title):
    """Ask the converter to keep the link as raw HTML."""
    result = {"type": "preserve_html"}
    return result

Error

Stop conversion and return an error:

def visit_link(self, ctx, href, text, title):
    """Abort the conversion when a link uses the ``javascript:`` scheme.

    The check is case-insensitive, ignores surrounding whitespace, and only
    looks at the start of the URL, so ``JavaScript:...`` is rejected while a
    URL that merely contains the text ``javascript:`` later on (for example
    in a query string) is converted normally. A missing href is treated as
    an empty string.
    """
    scheme_probe = (href or "").strip().lower()
    if scheme_probe.startswith("javascript:"):
        return {"type": "error", "reason": "JavaScript URLs not allowed"}
    return {"type": "continue"}

Common Use Cases

Remove All Images

class NoImagesVisitor:
    """Visitor that removes every image from the converted output."""

    def visit_image(self, ctx, src, alt, title):
        """Drop the image regardless of its source, alt text, or title."""
        result = {"type": "skip"}
        return result

markdown = convert_with_visitor(html, visitor=NoImagesVisitor())

Convert links to plain text:

class PlainTextLinksVisitor:
    """Visitor that replaces each link with its text."""

    def visit_link(self, ctx, href, text, title):
        """Emit only the link text, discarding the href and title."""
        return dict(type="custom", output=text)

markdown = convert_with_visitor(html, visitor=PlainTextLinksVisitor())

URL Rewriting

Rewrite relative URLs to absolute URLs:

from urllib.parse import urljoin

class AbsoluteUrlVisitor:
    """Rewrite link and image URLs to absolute URLs against a base URL."""

    def __init__(self, base_url):
        """Remember the base URL that relative references are resolved against."""
        self.base_url = base_url

    def _destination(self, url, title):
        """Return the text that goes inside ``(...)``: URL plus optional title.

        The title is wrapped in double quotes, so any backslash or double
        quote inside it is backslash-escaped; otherwise a title such as
        ``say "hi"`` would terminate the quoted title early and corrupt the
        generated Markdown.
        """
        absolute = urljoin(self.base_url, url)
        if not title:
            return absolute
        # Escape backslashes first so the escapes added for quotes survive.
        escaped = title.replace("\\", "\\\\").replace('"', '\\"')
        return f'{absolute} "{escaped}"'

    def visit_link(self, ctx, href, text, title):
        """Emit the link with its href resolved against the base URL."""
        return {"type": "custom", "output": f"[{text}]({self._destination(href, title)})"}

    def visit_image(self, ctx, src, alt, title):
        """Emit the image with its src resolved against the base URL."""
        return {"type": "custom", "output": f"![{alt}]({self._destination(src, title)})"}

visitor = AbsoluteUrlVisitor("https://example.com")
markdown = convert_with_visitor(html, visitor=visitor)
import { convertWithVisitor, Visitor, NodeContext, VisitResult } from '@kreuzberg/html-to-markdown';

const baseUrl = 'https://example.com';

// Resolve `url` against `baseUrl`. Returns null when the URL cannot be parsed,
// because `new URL()` throws on malformed input and a single bad href should
// not abort the whole conversion.
const toAbsolute = (url: string): string | null => {
  try {
    return new URL(url, baseUrl).toString();
  } catch {
    return null;
  }
};

// Visitor that rewrites link and image URLs to absolute URLs. Elements whose
// URL cannot be parsed fall back to the default conversion.
const visitor: Visitor = {
  visitLink(ctx: NodeContext, href: string, text: string): VisitResult {
    const absolute = toAbsolute(href);
    if (absolute === null) {
      return { type: 'continue' };
    }
    return { type: 'custom', output: `[${text}](${absolute})` };
  },
  visitImage(ctx: NodeContext, src: string, alt: string): VisitResult {
    const absolute = toAbsolute(src);
    if (absolute === null) {
      return { type: 'continue' };
    }
    return { type: 'custom', output: `![${alt}](${absolute})` };
  },
};

const markdown = convertWithVisitor(html, { visitor });

Security Filtering

Block dangerous URLs:

class SafeLinksVisitor:
    """Replace links that use a dangerous URL scheme with their plain text."""

    BLOCKED_SCHEMES = {"javascript:", "data:", "vbscript:"}

    # Translation table deleting every code point up to and including U+0020.
    # Browsers ignore tabs/newlines inside a URL and strip leading control
    # characters and spaces, so "java\tscript:" or "\x01javascript:" would
    # otherwise slip past a plain prefix comparison.
    _IGNORED_CHARS = dict.fromkeys(range(0x21))

    def visit_link(self, ctx, href, text, title):
        """Strip the link when its href starts with a blocked scheme.

        The comparison is case-insensitive and is made after removing
        whitespace and ASCII control characters from the href. A missing
        href is treated as an empty string. Links with any other scheme use
        the default conversion.
        """
        normalized = (href or "").translate(self._IGNORED_CHARS).lower()
        if normalized.startswith(tuple(self.BLOCKED_SCHEMES)):
            return {"type": "custom", "output": text}  # Strip the link
        return {"type": "continue"}

markdown = convert_with_visitor(html, visitor=SafeLinksVisitor())

Custom Heading Anchors

Add Markdown anchor syntax to headings:

import re

class AnchoredHeadingsVisitor:
    """Emit ATX headings followed by a ``{#slug}`` anchor."""

    def visit_heading(self, ctx, level, text, heading_id):
        """Render the heading with an anchor derived from its id or text.

        The slug is ``heading_id`` when present; otherwise it is the
        lower-cased text with spaces turned into hyphens and every character
        other than word characters and hyphens removed. When that leaves
        nothing (for example a heading made only of punctuation), the anchor
        is omitted instead of emitting an empty ``{#}``.
        """
        slug = heading_id or re.sub(r'[^\w-]', '', text.lower().replace(' ', '-'))
        hashes = "#" * level
        if not slug:
            return {"type": "custom", "output": f"{hashes} {text}"}
        return {"type": "custom", "output": f"{hashes} {text} {{#{slug}}}"}

markdown = convert_with_visitor(html, visitor=AnchoredHeadingsVisitor())

Platform-Specific Formatting

Generate Slack-compatible Markdown:

class SlackVisitor:
    """Visitor that emits Slack-style markers for inline formatting and code."""

    @staticmethod
    def _custom(output):
        """Wrap ``output`` in a custom visit result."""
        return {"type": "custom", "output": output}

    def visit_strong(self, ctx, text):
        """Bold text is wrapped in single asterisks."""
        return self._custom(f"*{text}*")

    def visit_emphasis(self, ctx, text):
        """Emphasised text is wrapped in underscores."""
        return self._custom(f"_{text}_")

    def visit_strikethrough(self, ctx, text):
        """Struck-through text is wrapped in single tildes."""
        return self._custom(f"~{text}~")

    def visit_code_block(self, ctx, language, code):
        """Code blocks are fenced without a language tag; ``language`` is ignored."""
        fence = "```"
        return self._custom(f"{fence}\n{code}\n{fence}")

Async Visitors

Python and TypeScript support async visitors for callbacks that need to perform I/O operations:

import asyncio
from html_to_markdown import convert_with_async_visitor

class AsyncUrlValidator:
    """Async visitor that keeps links whose href starts with "http"."""

    async def visit_link(self, ctx, href, text, title):
        """Keep "http"-prefixed links; replace all others with their text."""
        # Could check URL validity with an HTTP request
        if not href.startswith("http"):
            return {"type": "custom", "output": text}
        return {"type": "continue"}

markdown = convert_with_async_visitor(html, visitor=AsyncUrlValidator())
import { convertWithVisitor } from '@kreuzberg/html-to-markdown';

// Async visitor whose link callback resolves to the default conversion.
const asyncVisitor = {
  async visitLink(ctx, href, text) {
    // Could perform async URL validation
    const result = { type: 'continue' };
    return result;
  },
};

const markdown = await convertWithVisitor(html, { visitor: asyncVisitor });

Async overhead

Async visitors have slightly more overhead than sync visitors because each callback is bridged through the async runtime. Use sync visitors when your callbacks do not need to perform I/O.


Node Context

Every visitor callback receives a context object with metadata about the current element:

def visit_link(self, ctx, href, text, title):
    """Print each piece of context metadata, then use the default conversion."""
    context_fields = (
        "tag_name",         # "a"
        "depth",            # Nesting depth in DOM
        "parent_tag",       # Parent element tag (e.g., "p")
        "is_inline",        # True for inline elements
        "attributes",       # Dict of HTML attributes
        "index_in_parent",  # Sibling index
    )
    # Read and print one field at a time, in the order listed above.
    for field in context_fields:
        print(getattr(ctx, field))
    return {"type": "continue"}

You can use context to make conditional decisions. For example, only modify links inside a specific parent element:

def visit_link(self, ctx, href, text, title):
    """Drop links whose parent tag is "nav"; convert all others normally."""
    inside_nav = ctx.parent_tag == "nav"
    # Skip navigation links
    return {"type": "skip"} if inside_nav else {"type": "continue"}

Combining Visitors with Options

Visitors work alongside ConversionOptions. Options control the default behavior, while visitors override specific elements:

from html_to_markdown import ConversionOptions, convert_with_visitor

class MyVisitor:
    """Visitor that overrides image handling only."""

    def visit_image(self, ctx, src, alt, title):
        """Skip every image."""
        return dict(type="skip")

options = ConversionOptions(
    heading_style="atx",
    wrap=True,
    wrap_width=80,
)

markdown = convert_with_visitor(html, options, visitor=MyVisitor())

Combining Visitors with Metadata

In Rust, the convert_with_metadata function accepts an optional visitor, allowing both metadata extraction and custom conversion in a single pass:

let visitor = Some(visitor_handle);
let (markdown, metadata) = convert_with_metadata(html, Some(options), config, visitor)?;

Further Reading