# Table Extraction
Every call to convert() populates result.tables with one entry per <table> found in the input. Each entry has both a rendered Markdown string and a structured cell grid, so you can embed the Markdown in downstream documents or walk the grid for analysis without re-parsing.
Table extraction runs on every call. There is no opt-in flag. Set output_format to "none" if you only want the table data and not the rendered content.
## TableData
result.tables is a Vec<TableData> (or the equivalent list in each binding).
| Field | Type | Description |
|---|---|---|
| grid | TableGrid | Structured cell grid. |
| markdown | String | The Markdown rendering of this table, identical to what appears in result.content. |
## TableGrid
| Field | Type | Description |
|---|---|---|
| rows | u32 | Number of rows in the table. |
| cols | u32 | Number of columns in the table. |
| cells | Vec<GridCell> | Flat list of cells. May be shorter than rows * cols when cells span multiple rows or columns. |
## GridCell
| Field | Type | Description |
|---|---|---|
| content | String | Cell text. Inline formatting is flattened to plain text. |
| row | u32 | 0-indexed row position. |
| col | u32 | 0-indexed column position. |
| row_span | u32 | How many rows the cell occupies. Defaults to 1. |
| col_span | u32 | How many columns the cell occupies. Defaults to 1. |
| is_header | bool | true for <th>, false for <td>. |
## Basic Extraction
use html_to_markdown_rs::{ConversionOptions, convert};
/// Converts a small HTML table and prints every extracted cell with its
/// header/body kind and 0-indexed grid position.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Sample document: one header row followed by two body rows.
    let html = r#"
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>30</td></tr>
<tr><td>Bob</td><td>25</td></tr>
</table>
"#;
    let result = convert(html, Some(ConversionOptions::default()))?;
    // Chain every table's flat cell list into one stream so a single loop reports them all.
    let all_cells = result
        .tables
        .iter()
        .flat_map(|table| table.grid.cells.iter());
    for cell in all_cells {
        let kind = if cell.is_header { "Header" } else { "Cell" };
        println!(" {kind} (r{},c{}): {}", cell.row, cell.col, cell.content);
    }
    Ok(())
}
from html_to_markdown import ConversionOptions, convert
# Sample document: one header row followed by two body rows.
html = """
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>30</td></tr>
<tr><td>Bob</td><td>25</td></tr>
</table>
"""
result = convert(html, ConversionOptions())
# Lazily chain every table's flat cell list so one loop can report all cells in order.
all_cells = (cell for table in result.tables for cell in table.grid.cells)
for cell in all_cells:
    if cell.is_header:
        prefix = "Header"
    else:
        prefix = "Cell"
    print(f" {prefix} (r{cell.row},c{cell.col}): {cell.content}")
import { convert } from "@kreuzberg/html-to-markdown";
// Sample document: one header row followed by two body rows.
const html = `
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>30</td></tr>
<tr><td>Bob</td><td>25</td></tr>
</table>
`;
const result = convert(html);
// `tables` and `cells` are treated as optional: a missing list is read as empty.
const tables = result.tables ?? [];
for (const table of tables) {
  const cells = table.grid.cells ?? [];
  for (const cell of cells) {
    let kind = "Cell";
    if (cell.isHeader) {
      kind = "Header";
    }
    console.log(` ${kind} (r${cell.row},c${cell.col}): ${cell.content}`);
  }
}
import (
"fmt"
"log"
htmltomarkdown "github.com/kreuzberg-dev/html-to-markdown/packages/go/v3"
)
// Sample document: one header row followed by two body rows.
html := `
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>30</td></tr>
<tr><td>Bob</td><td>25</td></tr>
</table>
`
result, err := htmltomarkdown.Convert(html, nil)
if err != nil {
	log.Fatal(err)
}
// Label lookup keyed by the cell's header flag.
labels := map[bool]string{true: "Header", false: "Cell"}
for _, table := range result.Tables {
	for _, cell := range table.Grid.Cells {
		kind := labels[cell.IsHeader]
		fmt.Printf(" %s (r%d,c%d): %s\n", kind, cell.Row, cell.Col, cell.Content)
	}
}
require 'html_to_markdown'
# Sample document: one header row followed by two body rows.
html = <<~HTML
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>30</td></tr>
<tr><td>Bob</td><td>25</td></tr>
</table>
HTML
# NOTE(review): this page states that table extraction has no opt-in flag, yet this
# call passes `extract_tables: true` — confirm the option is still accepted by the gem.
result = HtmlToMarkdown.convert(html, extract_tables: true)
# NOTE(review): the loop below reads `table[:cells]` as a list of rows and
# `table[:is_header_row]` as per-row flags, which differs from the `grid` / `markdown`
# fields this page documents for TableData — verify against the current binding.
result[:tables].each do |table|
table[:cells].each_with_index do |row, i|
# A truthy flag at index `i` labels the whole row as a header row.
prefix = table[:is_header_row][i] ? "Header" : "Row"
puts " #{prefix}: #{row.join(', ')}"
end
end
use HtmlToMarkdown\HtmlToMarkdown;
// Sample document: one header row followed by two body rows.
$html = <<<HTML
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>30</td></tr>
<tr><td>Bob</td><td>25</td></tr>
</table>
HTML;
$result = HtmlToMarkdown::convert($html);
// Each table exposes a flat cell list on its grid; report every cell with its position.
foreach ($result->tables as $table) {
    $cells = $table->grid->cells;
    foreach ($cells as $cell) {
        $kind = 'Cell';
        if ($cell->isHeader) {
            $kind = 'Header';
        }
        echo " {$kind} (r{$cell->row},c{$cell->col}): {$cell->content}\n";
    }
}
import dev.kreuzberg.htmltomarkdown.HtmlToMarkdown;
import dev.kreuzberg.htmltomarkdown.ConversionOptions;
import dev.kreuzberg.htmltomarkdown.ConversionResult;
// Sample document: one header row followed by two body rows.
String html = """
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>30</td></tr>
<tr><td>Bob</td><td>25</td></tr>
</table>
""";
ConversionResult result = HtmlToMarkdown.convert(html, new ConversionOptions());
// Each table exposes a flat cell list on its grid; report every cell with its position.
for (var table : result.tables()) {
    var cells = table.grid().cells();
    for (var cell : cells) {
        String prefix;
        if (cell.isHeader()) {
            prefix = "Header";
        } else {
            prefix = "Cell";
        }
        System.out.printf(" %s (r%d,c%d): %s%n", prefix, cell.row(), cell.col(), cell.content());
    }
}
using HtmlToMarkdown;
// Sample document: one header row followed by two body rows.
var html = @"
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>30</td></tr>
<tr><td>Bob</td><td>25</td></tr>
</table>";
var result = HtmlToMarkdownConverter.Convert(html);
// Each table exposes a flat cell list on its grid; report every cell with its position.
foreach (var table in result.Tables)
{
    var cells = table.Grid.Cells;
    foreach (var cell in cells)
    {
        string kind;
        if (cell.IsHeader)
        {
            kind = "Header";
        }
        else
        {
            kind = "Cell";
        }
        Console.WriteLine($" {kind} (r{cell.Row},c{cell.Col}): {cell.Content}");
    }
}
# Sample document: one header row followed by two body rows.
html = """
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>30</td></tr>
<tr><td>Bob</td><td>25</td></tr>
</table>
"""
# NOTE(review): this page states that table extraction has no opt-in flag, yet the
# options struct sets `extract_tables: true` — confirm the field still exists.
opts = %HtmlToMarkdown.Options{extract_tables: true}
# The match raises if `convert/2` returns anything other than `{:ok, result}`.
{:ok, result} = HtmlToMarkdown.convert(html, opts)
# NOTE(review): the generator pattern expects `cells` / `is_header_row` keys, which
# differs from the `grid` / `markdown` fields this page documents for TableData.
# `for` silently skips entries that do not match the pattern — verify against the binding.
for %{cells: cells, is_header_row: is_header_row} <- result.tables do
cells
|> Enum.with_index()
|> Enum.each(fn {row, i} ->
# A truthy flag at index `i` labels the whole row as a header row.
prefix = if Enum.at(is_header_row, i), do: "Header", else: "Row"
IO.puts(" #{prefix}: #{Enum.join(row, ", ")}")
end)
end
library(htmltomarkdown)
# Sample document: one header row followed by two body rows.
html <- "
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>30</td></tr>
<tr><td>Bob</td><td>25</td></tr>
</table>
"
# NOTE(review): this page states that table extraction has no opt-in flag, yet the
# options are built with `extract_tables = TRUE` — confirm the argument still exists.
opts <- conversion_options(extract_tables = TRUE)
result <- convert(html, opts)
# NOTE(review): the loop reads `table$cells` as a list of rows and `table$is_header_row`
# as per-row flags, which differs from the `grid` / `markdown` fields this page
# documents for TableData — verify against the current binding.
for (table in result$tables) {
for (i in seq_along(table$cells)) {
# A TRUE flag at index `i` labels the whole row as a header row.
prefix <- if (table$is_header_row[[i]]) "Header" else "Row"
cat(sprintf(" %s: %s\n", prefix, paste(table$cells[[i]], collapse = ", ")))
}
}
#include "html_to_markdown.h"
#include <stdio.h>
/* Converts a small HTML table and prints the whole conversion result as JSON.
 * Returns 1 when the conversion fails, 0 otherwise. */
int main(void) {
    const char *html =
        "<table>"
        "<tr><th>Name</th><th>Age</th></tr>"
        "<tr><td>Alice</td><td>30</td></tr>"
        "<tr><td>Bob</td><td>25</td></tr>"
        "</table>";
    /* Tables are extracted by default. The simplest way to inspect them in
     * C is to serialise the result to JSON and parse it with your JSON lib. */
    HTMConversionResult *converted = htm_convert(html, NULL);
    if (!converted) {
        fprintf(stderr, "convert failed: %s\n", htm_last_error_context());
        return 1;
    }
    char *serialized = htm_conversion_result_to_json(converted);
    /* A NULL serialisation is skipped silently; the result is still released below. */
    if (serialized) {
        printf("%s\n", serialized); /* contains a "tables" array */
        htm_free_string(serialized);
    }
    htm_conversion_result_free(converted);
    return 0;
}
import init, { convert } from "@kreuzberg/html-to-markdown-wasm";
// The WASM module must be initialised before `convert` is called.
await init();
// Sample document: one header row followed by two body rows.
const html = `
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>30</td></tr>
<tr><td>Bob</td><td>25</td></tr>
</table>
`;
const result = convert(html);
// `tables` and `cells` are treated as optional: a missing list is read as empty.
const tables = result.tables ?? [];
for (const table of tables) {
  const cells = table.grid.cells ?? [];
  for (const cell of cells) {
    const kind = !cell.isHeader ? "Cell" : "Header";
    console.log(` ${kind} (r${cell.row},c${cell.col}): ${cell.content}`);
  }
}
import HtmlToMarkdown
// Sample document: one header row followed by two body rows.
let html = """
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>30</td></tr>
<tr><td>Bob</td><td>25</td></tr>
</table>
"""
let result = try convert(html, nil)
// For each table: print its Markdown, its grid dimensions, then every cell with its position.
for table in result.tables() {
    print("Markdown:", table.markdown().toString())
    let grid = table.grid()
    print("Grid: \(grid.rows()) rows x \(grid.cols()) cols")
    for cell in grid.cells() {
        let kind: String
        if cell.is_header() {
            kind = "Header"
        } else {
            kind = "Cell"
        }
        print(" \(kind) (r\(cell.row()),c\(cell.col())): \(cell.content().toString())")
    }
}
import 'package:h2m/h2m.dart';
import 'package:h2m/src/html_to_markdown_rs_bridge_generated/frb_generated.dart'
show RustLib;
/// Initialises the Rust bridge, converts a small HTML table, and prints every
/// extracted cell with its header/body kind and grid position.
Future<void> main() async {
  // The bridge library must be initialised before any conversion call.
  await RustLib.init();
  const html = '''
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>30</td></tr>
<tr><td>Bob</td><td>25</td></tr>
</table>
''';
  final result = await H2mBridge.convert(html);
  for (final table in result.tables) {
    final cells = table.grid.cells;
    for (final cell in cells) {
      var kind = 'Cell';
      if (cell.isHeader) {
        kind = 'Header';
      }
      print(' $kind (r${cell.row},c${cell.col}): ${cell.content}');
    }
  }
}
import dev.kreuzberg.android.ConversionOptions
import dev.kreuzberg.android.HtmlToMarkdownRs
// Sample document: one header row followed by two body rows.
val html = """
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>30</td></tr>
<tr><td>Bob</td><td>25</td></tr>
</table>
""".trimIndent()
val result = HtmlToMarkdownRs.convert(html, ConversionOptions.builder().build())
// For each table: print its Markdown, then every cell with its kind and position.
result.tables.forEach { table ->
    println(table.markdown)
    table.grid.cells.forEach { cell ->
        val kind = when {
            cell.isHeader -> "Header"
            else -> "Cell"
        }
        println(" $kind (r${cell.row},c${cell.col}): ${cell.content}")
    }
}
const std = @import("std");
const html_to_markdown = @import("html_to_markdown");
// Converts a small HTML table, parses the returned JSON, and prints each table's
// dimensions, Markdown rendering, and cells to stderr via std.debug.print.
pub fn main() !void {
// General-purpose allocator used for the JSON parse tree below.
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
defer _ = gpa.deinit();
const allocator = gpa.allocator();
const html =
\\<table>
\\ <tr><th>Name</th><th>Age</th></tr>
\\ <tr><td>Alice</td><td>30</td></tr>
\\ <tr><td>Bob</td><td>25</td></tr>
\\</table>
;
// Tables are always extracted into the `tables` array of ConversionResult.
// The result comes back as JSON text, which is parsed below.
const result_json = try html_to_markdown.convert(html, null);
// NOTE(review): the JSON buffer is released with `std.heap.c_allocator`, not the GPA
// used for parsing — presumably the binding allocates it that way; confirm.
defer std.heap.c_allocator.free(result_json);
var parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
defer parsed.deinit();
// Each `.?` below unwraps an optional lookup, so a missing key is treated as a bug
// rather than handled as an error.
const tables = parsed.value.object.get("tables").?.array;
std.debug.print("Extracted {d} table(s)\n", .{tables.items.len});
for (tables.items, 0..) |table, i| {
const grid = table.object.get("grid").?.object;
const rows = grid.get("rows").?.integer;
const cols = grid.get("cols").?.integer;
const markdown = table.object.get("markdown").?.string;
std.debug.print("Table {d}: {d}x{d}\n{s}\n", .{ i, rows, cols, markdown });
// Walk the table's flat cell list and print each cell's position, header flag, and text.
for (grid.get("cells").?.array.items) |cell| {
const content = cell.object.get("content").?.string;
const row = cell.object.get("row").?.integer;
const col = cell.object.get("col").?.integer;
const is_header = cell.object.get("is_header").?.bool;
std.debug.print(" [{d},{d}] header={} '{s}'\n", .{ row, col, is_header, content });
}
}
}
## Relationship to result.content
The Markdown in TableData.markdown is the same Markdown that appears inline inside result.content. The grid exists for code that needs cell-level access: headers vs body rows, span detection, or programmatic lookup by (row, col).
If the input has no tables, result.tables is an empty list. If the output format is "plain" or "none", tables are still extracted and their grids are still populated; only the Markdown rendering in result.content changes.
## Spans
A cell with row_span > 1 or col_span > 1 appears once in cells, positioned at its top-left coordinates. Downstream code that iterates by (row, col) should respect the span or use the spans to reconstruct a dense grid.
Found a bug or mistake on this page?
If something here is wrong or out of date, open an issue on GitHub or contribute a fix via pull request.