Java提取markdown中的表格
说明
这篇博文是一个舍近求远的操作:如果只需要对markdown中的表格数据进行提取,完全可以通过正则表达式或者字符串切分来完成。但是出于学习的目的,这次采用了commonmark包中的工具来完成。具体实现过程如下。
实现步骤
引入pom依赖
xml
<!-- commonmark core artifact; the code below imports org.commonmark.node.* from this group -->
<dependency>
<groupId>org.commonmark</groupId>
<artifactId>commonmark</artifactId>
<version>0.21.0</version>
</dependency>
<!-- GFM tables extension artifact; the code below imports org.commonmark.ext.gfm.tables.*.
     Its version is pinned to the same 0.21.0 as the core artifact above. -->
<dependency>
<groupId>org.commonmark</groupId>
<artifactId>commonmark-ext-gfm-tables</artifactId>
<version>0.21.0</version>
</dependency>
自定义visitor
java
import org.commonmark.ext.gfm.tables.*;
import org.commonmark.node.*;
import java.util.ArrayList;
import java.util.List;
/**
 * Visitor that extracts the header cells and body rows of tables from a
 * parsed commonmark document.
 *
 * <p>Table nodes reach this visitor through {@link #visit(CustomBlock)} and
 * {@link #visit(CustomNode)}; every other node is delegated to
 * {@link AbstractVisitor}.
 *
 * <p>If the document contains several tables, the headers returned are those
 * of the last header row visited, while body rows from all tables are
 * accumulated in visiting order.
 *
 * <p>Instances hold mutable traversal state; use a fresh instance per document.
 */
public class TableVisitor extends AbstractVisitor {
    /** True while the children of a {@code TableHead} are being visited. */
    private boolean inHeader = false;
    /** True while the children of a {@code TableBody} are being visited. */
    private boolean inBody = false;
    /** Cells of the row currently being visited, or {@code null} outside a row. */
    private List<String> currentRow = null;
    /** Cell texts of the most recently visited header row. */
    private List<String> headers = new ArrayList<>();
    /** Cell texts of every body row visited so far. */
    private final List<List<String>> rows = new ArrayList<>();

    /**
     * Intercepts table blocks; any other custom block gets the default traversal.
     *
     * @param customBlock the block node being visited
     */
    @Override
    public void visit(CustomBlock customBlock) {
        if (customBlock instanceof TableBlock) {
            handleTableBlock((TableBlock) customBlock);
        } else {
            super.visit(customBlock);
        }
    }

    /**
     * Dispatches table head/body/row/cell nodes to their handlers; any other
     * custom node gets the default traversal.
     *
     * @param customNode the node being visited
     */
    @Override
    public void visit(CustomNode customNode) {
        if (customNode instanceof TableHead) {
            handleTableHead((TableHead) customNode);
        } else if (customNode instanceof TableBody) {
            handleTableBody((TableBody) customNode);
        } else if (customNode instanceof TableRow) {
            handleTableRow((TableRow) customNode);
        } else if (customNode instanceof TableCell) {
            handleTableCell((TableCell) customNode);
        } else {
            super.visit(customNode);
        }
    }

    /** Resets the section flags and descends into the table. */
    private void handleTableBlock(TableBlock tableBlock) {
        // Only the head/body flags are reset here; collected headers and rows
        // are intentionally left untouched (see class documentation).
        inHeader = false;
        inBody = false;
        visitChildren(tableBlock);
    }

    /** Marks the header section as active while its rows are visited. */
    private void handleTableHead(TableHead tableHead) {
        inHeader = true;
        visitChildren(tableHead);
        inHeader = false;
    }

    /** Marks the body section as active while its rows are visited. */
    private void handleTableBody(TableBody tableBody) {
        inBody = true;
        visitChildren(tableBody);
        inBody = false;
    }

    /**
     * Collects the cells of one row and stores the result as the header row
     * or as an additional body row, depending on the active section.
     */
    private void handleTableRow(TableRow tableRow) {
        List<String> cells = new ArrayList<>();
        currentRow = cells;
        visitChildren(tableRow);
        // Clear the accumulator so nothing can be appended to a finished row.
        currentRow = null;
        if (inHeader) {
            this.headers = cells;
        } else if (inBody) {
            this.rows.add(cells);
        }
    }

    /** Appends the text of one cell to the row currently being collected. */
    private void handleTableCell(TableCell tableCell) {
        if (currentRow != null) {
            currentRow.add(getTextContent(tableCell));
        }
        // Keep descending so visit(...) overrides still see the cell's inline nodes.
        visitChildren(tableCell);
    }

    /**
     * Returns the trimmed, concatenated text found anywhere below {@code node}.
     *
     * @param node the node whose descendants are scanned
     * @return the collected text, possibly empty, never {@code null}
     */
    private String getTextContent(Node node) {
        StringBuilder sb = new StringBuilder();
        appendText(node, sb);
        return sb.toString().trim();
    }

    /**
     * Appends the literals of all {@code Text} and {@code Code} descendants of
     * {@code node} to {@code sb}, in document order.
     */
    private void appendText(Node node, StringBuilder sb) {
        for (Node child = node.getFirstChild(); child != null; child = child.getNext()) {
            if (child instanceof Text) {
                sb.append(((Text) child).getLiteral());
            } else if (child instanceof Code) {
                // Inline code spans carry their text as a literal, not as Text children.
                sb.append(((Code) child).getLiteral());
            } else {
                // Other nodes (e.g. emphasis, links) wrap their text in child
                // nodes, so recurse instead of dropping the content.
                appendText(child, sb);
            }
        }
    }

    /**
     * Returns the cell texts of the most recently visited header row.
     *
     * @return the internal header list (not a copy); empty if no header row was visited
     */
    public List<String> getTableHeaders() {
        return headers;
    }

    /**
     * Returns the cell texts of all body rows visited so far, one inner list per row.
     *
     * @return the internal row list (not a copy); empty if no body row was visited
     */
    public List<List<String>> getTableRows() {
        return rows;
    }
}
测试用例
java
/**
 * Demo entry point: parses a small Markdown table with the tables extension
 * enabled, runs a {@code TableVisitor} over the document and prints the
 * collected headers and rows to standard output.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String markdown = """
            | 姓名 | 性别 | 班级 | 年龄 |
            |--------------|------|--------------------|--------------------|
            | 张三 | 男 | 兴趣一班 | 17 |
            | 李四 | 男 | 兴趣一班 | 16 |
            """;

    // Build a parser that understands the table syntax, then parse the sample.
    Parser tableAwareParser = Parser.builder()
            .extensions(Arrays.asList(TablesExtension.create()))
            .build();
    Node root = tableAwareParser.parse(markdown);

    // Walk the tree once; the visitor accumulates headers and rows as it goes.
    TableVisitor collector = new TableVisitor();
    root.accept(collector);

    System.out.println("表头: " + collector.getTableHeaders());
    System.out.println("表格行数据: " + collector.getTableRows());
}
总结
由于没有在commonmark中找到我们需要的visitor,所以自定义了visitor。希望可以对其他同学有所帮助吧。