Skip to content

Table Extraction

Every call to convert() populates result.tables with one entry per <table> found in the input. Each entry has both a rendered Markdown string and a structured cell grid, so you can embed the Markdown in downstream documents or walk the grid for analysis without re-parsing.

Table extraction runs on every call. There is no opt-in flag. Set output_format to "none" if you only want the table data and not the rendered content.

TableData

result.tables is a Vec<TableData> (or the equivalent list in each binding).

Field Type Description
grid TableGrid Structured cell grid.
markdown String The Markdown rendering of this table, identical to what appears in result.content.

TableGrid

Field Type Description
rows u32 Number of rows in the table.
cols u32 Number of columns in the table.
cells Vec<GridCell> Flat list of cells. Contains fewer than rows * cols entries when any cell spans multiple rows or columns, because a spanning cell is listed only once.

GridCell

Field Type Description
content String Cell text. Inline formatting is flattened to plain text.
row u32 0-indexed row position.
col u32 0-indexed column position.
row_span u32 How many rows the cell occupies. Defaults to 1.
col_span u32 How many columns the cell occupies. Defaults to 1.
is_header bool true for <th>, false for <td>.

Basic Extraction

use html_to_markdown_rs::{ConversionOptions, convert};

/// Converts a small HTML table and prints one line per extracted grid cell.
///
/// Any error returned by `convert` is propagated to the caller via `?`.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let html = r#"
<table>
    <tr><th>Name</th><th>Age</th></tr>
    <tr><td>Alice</td><td>30</td></tr>
    <tr><td>Bob</td><td>25</td></tr>
</table>
"#;

    let result = convert(html, Some(ConversionOptions::default()))?;

    // Flatten the cells of every table into a single stream, preserving order.
    result
        .tables
        .iter()
        .flat_map(|table| table.grid.cells.iter())
        .for_each(|cell| {
            let kind = match cell.is_header {
                true => "Header",
                false => "Cell",
            };
            println!("  {kind} (r{},c{}): {}", cell.row, cell.col, cell.content);
        });

    Ok(())
}
from html_to_markdown import ConversionOptions, convert

html = """
<table>
    <tr><th>Name</th><th>Age</th></tr>
    <tr><td>Alice</td><td>30</td></tr>
    <tr><td>Bob</td><td>25</td></tr>
</table>
"""

# Convert using a default-constructed ConversionOptions.
result = convert(html, ConversionOptions())

# Walk the cells of every table in one flattened pass, in table order.
all_cells = (cell for table in result.tables for cell in table.grid.cells)
for cell in all_cells:
    if cell.is_header:
        prefix = "Header"
    else:
        prefix = "Cell"
    print(f"  {prefix} (r{cell.row},c{cell.col}): {cell.content}")
import { convert } from "@kreuzberg/html-to-markdown";

const html = `
<table>
    <tr><th>Name</th><th>Age</th></tr>
    <tr><td>Alice</td><td>30</td></tr>
    <tr><td>Bob</td><td>25</td></tr>
</table>
`;

// No options are passed; the cell data is read from result.tables below.
const result = convert(html);

// Fall back to empty lists so a nullish tables/cells value is simply skipped.
const tables = result.tables ?? [];
for (const table of tables) {
  const cells = table.grid.cells ?? [];
  for (const { isHeader, row, col, content } of cells) {
    const kind = isHeader ? "Header" : "Cell";
    console.log(`  ${kind} (r${row},c${col}): ${content}`);
  }
}
import (
    "fmt"
    "log"

    htmltomarkdown "github.com/kreuzberg-dev/html-to-markdown/packages/go/v3"
)

html := `
<table>
    <tr><th>Name</th><th>Age</th></tr>
    <tr><td>Alice</td><td>30</td></tr>
    <tr><td>Bob</td><td>25</td></tr>
</table>
`

// nil is passed as the options argument.
result, err := htmltomarkdown.Convert(html, nil)
if err != nil {
    log.Fatal(err)
}

// Label lookup keyed by each cell's IsHeader flag.
labels := map[bool]string{true: "Header", false: "Cell"}

for _, tbl := range result.Tables {
    for _, c := range tbl.Grid.Cells {
        fmt.Printf("  %s (r%d,c%d): %s\n", labels[c.IsHeader], c.Row, c.Col, c.Content)
    }
}
require 'html_to_markdown'

# NOTE(review): this snippet passes `extract_tables: true` and reads
# `table[:cells]` / `table[:is_header_row]`, whereas this page states that
# there is no opt-in flag and documents each table as `grid.cells` entries
# carrying `row`, `col` and `is_header`. Confirm against the current Ruby
# binding before relying on this example.
html = <<~HTML
  <table>
      <tr><th>Name</th><th>Age</th></tr>
      <tr><td>Alice</td><td>30</td></tr>
      <tr><td>Bob</td><td>25</td></tr>
  </table>
HTML

result = HtmlToMarkdown.convert(html, extract_tables: true)

# Print one line per row; a row is labelled "Header" when the entry at the
# same index in :is_header_row is truthy, otherwise "Row".
result[:tables].each do |table|
  table[:cells].each_with_index do |row, i|
    prefix = table[:is_header_row][i] ? "Header" : "Row"
    puts "  #{prefix}: #{row.join(', ')}"
  end
end
use HtmlToMarkdown\HtmlToMarkdown;

$html = <<<HTML
<table>
    <tr><th>Name</th><th>Age</th></tr>
    <tr><td>Alice</td><td>30</td></tr>
    <tr><td>Bob</td><td>25</td></tr>
</table>
HTML;

// Only the HTML is passed; no options argument is supplied.
$result = HtmlToMarkdown::convert($html);

// Emit one line per grid cell, labelled by whether it is a header cell.
foreach ($result->tables as $table) {
    foreach ($table->grid->cells as $cell) {
        if ($cell->isHeader) {
            $kind = 'Header';
        } else {
            $kind = 'Cell';
        }
        echo '  ' . $kind . ' (r' . $cell->row . ',c' . $cell->col . '): ' . $cell->content . "\n";
    }
}
import dev.kreuzberg.htmltomarkdown.HtmlToMarkdown;
import dev.kreuzberg.htmltomarkdown.ConversionOptions;
import dev.kreuzberg.htmltomarkdown.ConversionResult;

String html = """
<table>
    <tr><th>Name</th><th>Age</th></tr>
    <tr><td>Alice</td><td>30</td></tr>
    <tr><td>Bob</td><td>25</td></tr>
</table>
""";

// Convert using a freshly constructed ConversionOptions.
ConversionResult result = HtmlToMarkdown.convert(html, new ConversionOptions());

// Emit one line per grid cell, labelled by whether it is a header cell.
for (var table : result.tables()) {
    for (var cell : table.grid().cells()) {
        String prefix;
        if (cell.isHeader()) {
            prefix = "Header";
        } else {
            prefix = "Cell";
        }
        String line = String.format("  %s (r%d,c%d): %s%n", prefix, cell.row(), cell.col(), cell.content());
        System.out.print(line);
    }
}
using HtmlToMarkdown;

var html = @"
<table>
    <tr><th>Name</th><th>Age</th></tr>
    <tr><td>Alice</td><td>30</td></tr>
    <tr><td>Bob</td><td>25</td></tr>
</table>";

// Only the HTML is passed; no options argument is supplied.
var result = HtmlToMarkdownConverter.Convert(html);

// Emit one line per grid cell, labelled by whether it is a header cell.
foreach (var table in result.Tables)
{
    foreach (var cell in table.Grid.Cells)
    {
        string kind;
        if (cell.IsHeader)
        {
            kind = "Header";
        }
        else
        {
            kind = "Cell";
        }
        Console.WriteLine("  {0} (r{1},c{2}): {3}", kind, cell.Row, cell.Col, cell.Content);
    }
}
# NOTE(review): this snippet sets `extract_tables: true` and pattern-matches
# tables on `cells` / `is_header_row`, whereas this page states that there is
# no opt-in flag and documents each table as `grid.cells` entries carrying
# `row`, `col` and `is_header`. Confirm against the current Elixir binding
# before relying on this example.
html = """
<table>
    <tr><th>Name</th><th>Age</th></tr>
    <tr><td>Alice</td><td>30</td></tr>
    <tr><td>Bob</td><td>25</td></tr>
</table>
"""

opts = %HtmlToMarkdown.Options{extract_tables: true}
# The match on {:ok, result} raises if convert/2 returns anything else.
{:ok, result} = HtmlToMarkdown.convert(html, opts)

# Print one line per row; a row is labelled "Header" when the entry at the
# same index in is_header_row is truthy, otherwise "Row".
for %{cells: cells, is_header_row: is_header_row} <- result.tables do
  cells
  |> Enum.with_index()
  |> Enum.each(fn {row, i} ->
    prefix = if Enum.at(is_header_row, i), do: "Header", else: "Row"
    IO.puts("  #{prefix}: #{Enum.join(row, ", ")}")
  end)
end
library(htmltomarkdown)

# NOTE(review): this snippet sets `extract_tables = TRUE` and reads
# `table$cells` / `table$is_header_row`, whereas this page states that there
# is no opt-in flag and documents each table as `grid$cells` entries carrying
# `row`, `col` and `is_header`. Confirm against the current R binding before
# relying on this example.
html <- "
<table>
    <tr><th>Name</th><th>Age</th></tr>
    <tr><td>Alice</td><td>30</td></tr>
    <tr><td>Bob</td><td>25</td></tr>
</table>
"

opts <- conversion_options(extract_tables = TRUE)
result <- convert(html, opts)

# Print one line per row; a row is labelled "Header" when the entry at the
# same index in is_header_row is TRUE, otherwise "Row".
for (table in result$tables) {
  for (i in seq_along(table$cells)) {
    prefix <- if (table$is_header_row[[i]]) "Header" else "Row"
    cat(sprintf("  %s: %s\n", prefix, paste(table$cells[[i]], collapse = ", ")))
  }
}
#include "html_to_markdown.h"
#include <stdio.h>

/*
 * Converts a small HTML table and prints the whole conversion result as JSON.
 *
 * Returns 0 when the conversion produced a result, 1 when htm_convert()
 * returned NULL.
 */
int main(void) {
    const char *html =
        "<table>"
        "<tr><th>Name</th><th>Age</th></tr>"
        "<tr><td>Alice</td><td>30</td></tr>"
        "<tr><td>Bob</td><td>25</td></tr>"
        "</table>";
    int status = 1;

    /* Tables are extracted by default. The simplest way to inspect them in
     * C is to serialise the result to JSON and parse it with your JSON lib. */
    HTMConversionResult *result = htm_convert(html, NULL);
    if (result != NULL) {
        char *json = htm_conversion_result_to_json(result);
        if (json != NULL) {
            printf("%s\n", json);  /* contains a "tables" array */
            htm_free_string(json);
        }

        /* The result is released even when JSON serialisation yielded NULL. */
        htm_conversion_result_free(result);
        status = 0;
    } else {
        fprintf(stderr, "convert failed: %s\n", htm_last_error_context());
    }

    return status;
}
import init, { convert } from "@kreuzberg/html-to-markdown-wasm";

// init() is awaited before convert() is called.
await init();

const html = `
<table>
    <tr><th>Name</th><th>Age</th></tr>
    <tr><td>Alice</td><td>30</td></tr>
    <tr><td>Bob</td><td>25</td></tr>
</table>
`;

const result = convert(html);

// Maps a cell's header flag to the label printed in front of it.
const labelFor = (isHeader) => (isHeader ? "Header" : "Cell");

// Fall back to empty lists so a nullish tables/cells value is simply skipped.
for (const { grid } of result.tables ?? []) {
  for (const { isHeader, row, col, content } of grid.cells ?? []) {
    console.log(`  ${labelFor(isHeader)} (r${row},c${col}): ${content}`);
  }
}
import HtmlToMarkdown

let html = """
<table>
  <tr><th>Name</th><th>Age</th></tr>
  <tr><td>Alice</td><td>30</td></tr>
  <tr><td>Bob</td><td>25</td></tr>
</table>
"""

// nil is passed as the options argument; a thrown error propagates via `try`.
let result = try convert(html, nil)

// For each table: print its Markdown, its grid dimensions, then every cell.
for table in result.tables() {
    print("Markdown: \(table.markdown().toString())")

    let grid = table.grid()
    print("Grid: \(grid.rows()) rows x \(grid.cols()) cols")

    for cell in grid.cells() {
        let kind: String
        if cell.is_header() {
            kind = "Header"
        } else {
            kind = "Cell"
        }
        print("  \(kind) (r\(cell.row()),c\(cell.col())): \(cell.content().toString())")
    }
}
import 'package:h2m/h2m.dart';
import 'package:h2m/src/html_to_markdown_rs_bridge_generated/frb_generated.dart'
    show RustLib;

/// Converts a small HTML table and prints one line per extracted grid cell.
///
/// `RustLib.init()` is awaited before the conversion is requested.
Future<void> main() async {
  await RustLib.init();

  const html = '''
<table>
  <tr><th>Name</th><th>Age</th></tr>
  <tr><td>Alice</td><td>30</td></tr>
  <tr><td>Bob</td><td>25</td></tr>
</table>
''';

  final result = await H2mBridge.convert(html);

  for (final table in result.tables) {
    for (final cell in table.grid.cells) {
      final String kind;
      if (cell.isHeader) {
        kind = 'Header';
      } else {
        kind = 'Cell';
      }
      final position = 'r${cell.row},c${cell.col}';
      print('  $kind ($position): ${cell.content}');
    }
  }
}
import dev.kreuzberg.android.ConversionOptions
import dev.kreuzberg.android.HtmlToMarkdownRs

val html = """
    <table>
      <tr><th>Name</th><th>Age</th></tr>
      <tr><td>Alice</td><td>30</td></tr>
      <tr><td>Bob</td><td>25</td></tr>
    </table>
""".trimIndent()

// Options come straight from the builder with nothing customised.
val result = HtmlToMarkdownRs.convert(html, ConversionOptions.builder().build())

// For each table: print its Markdown, then one line per grid cell.
result.tables.forEach { table ->
    println(table.markdown)
    table.grid.cells.forEach { cell ->
        val kind = when {
            cell.isHeader -> "Header"
            else -> "Cell"
        }
        println("  $kind (r${cell.row},c${cell.col}): ${cell.content}")
    }
}
const std = @import("std");
const html_to_markdown = @import("html_to_markdown");

/// Converts a small HTML table, parses the JSON returned by `convert`, and
/// prints each table's dimensions, Markdown and cells to stderr via
/// `std.debug.print`.
pub fn main() !void {
    // Allocator used for JSON parsing; the result of deinit() is discarded.
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const html =
        \\<table>
        \\  <tr><th>Name</th><th>Age</th></tr>
        \\  <tr><td>Alice</td><td>30</td></tr>
        \\  <tr><td>Bob</td><td>25</td></tr>
        \\</table>
    ;

    // Tables are always extracted into the `tables` array of ConversionResult.
    const result_json = try html_to_markdown.convert(html, null);
    // Freed with std.heap.c_allocator rather than the GPA above.
    // NOTE(review): presumably the binding allocates this buffer with the C
    // allocator — confirm against the binding.
    defer std.heap.c_allocator.free(result_json);

    var parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    // Each `.?` below assumes the key is present in the JSON.
    const tables = parsed.value.object.get("tables").?.array;
    std.debug.print("Extracted {d} table(s)\n", .{tables.items.len});

    for (tables.items, 0..) |table, i| {
        const grid = table.object.get("grid").?.object;
        const rows = grid.get("rows").?.integer;
        const cols = grid.get("cols").?.integer;
        const markdown = table.object.get("markdown").?.string;
        std.debug.print("Table {d}: {d}x{d}\n{s}\n", .{ i, rows, cols, markdown });

        // One line per cell: position, header flag, and text content.
        for (grid.get("cells").?.array.items) |cell| {
            const content = cell.object.get("content").?.string;
            const row = cell.object.get("row").?.integer;
            const col = cell.object.get("col").?.integer;
            const is_header = cell.object.get("is_header").?.bool;
            std.debug.print("  [{d},{d}] header={} '{s}'\n", .{ row, col, is_header, content });
        }
    }
}

Relationship to result.content

The Markdown in TableData.markdown is the same Markdown that appears inline inside result.content. The grid exists for code that needs cell-level access: headers vs body rows, span detection, or programmatic lookup by (row, col).

If the input has no tables, result.tables is an empty list. If the output format is "plain" or "none", tables are still extracted and their grids are still populated; only what is rendered into result.content changes.

Spans

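A cell with row_span > 1 or col_span > 1 appears once in cells, positioned at its top-left coordinates; the other positions it covers have no entry of their own. Downstream code that iterates by (row, col) should either treat those covered positions as part of the spanning cell or expand each cell across its row_span and col_span to reconstruct a dense grid.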


Found a bug or mistake on this page?

If something here is wrong or out of date, open an issue on GitHub or contribute a fix via pull request.

Edit this page on GitHub