简单的解析器，将 OneNote 的 HTML 转换为 Markdown

colinfang

1.00/5 (1投票)

2013 年 1 月 13 日

CPOL

2分钟阅读

28890

412

OneNote2Markdown 将 OneNote 生成的 HTML 转换为 Markdown 格式，然后可以通过任何在线 Markdown 解析器将其转换为更简洁、规范化的 HTML。

引言

本文介绍了 OneNote2Markdown，一个我编写的解析器，它可以将 OneNote 生成的 HTML 文件（通过发送到 Word 并另存为 HTML）转换为 Markdown 格式，然后可以通过任何在线 Markdown 解析器转换为更简洁的 HTML。

该工具使用 F# 编写，适用于 OneNote 2010 和 Word 2010。它仅处理普通段落、标题、链接、列表、内联代码和代码块。
该工具从 "input.html" 读取并写入 "output.txt"。
最新版本的源代码可以在 Bitbucket 上查看。编译需要 HtmlAgilityPack。
示例包包含本文的 docx、html 和 Markdown 格式，这将提供一个关于该工具如何工作的基本演示。

背景

我倾向于在 OneNote 中记笔记。当我第一次尝试提交用 OneNote 编写的文章时，手动将内容调整到 Code Project 的模板中确实很麻烦。因此，我决定编写一个解析器，为我自动执行大部分格式化工作。

实现概述

准备工作

一个活动模式，用于模式匹配，以确定文本节点是否具有特定的祖先节点，例如 <b> 或 <i>。
```
/// Parameterised active pattern: produces true when `node` has at least one
/// ancestor returned by `node.Ancestors(tag)`, and false otherwise.
let (|HasAncestor|) tag (node: HtmlNode) =
    not (Seq.isEmpty (node.Ancestors(tag)))
```

一个函数，用于查找文本节点从其祖先 <span> 的 style 属性继承的特定 CSS 属性（取最近的一个）。

/// Looks up `cssProperty` in the inline styles of the enclosing <span> elements
/// and returns the value from the closest one that declares it, or None when
/// no enclosing <span> declares it.
///
/// The property name is matched literally (regex-escaped) and only at the start
/// of a declaration, so asking for "color" does not pick up the value of
/// "background-color", nor "font-family" that of "mso-fareast-font-family".
let getPartialStyle cssProperty (node: HtmlNode) =
    // Style text looks like "property1:value1;property2:value2".
    // A declaration starts at the beginning of the text or right after ';'
    // (optionally followed by whitespace such as a wrapped line).
    let pattern = "(?:^|;)\\s*" + Regex.Escape cssProperty + ":(.+?)(;|$)"
    let predicate node =
        let myMatch = Regex.Match(getStyle node, pattern)
        if myMatch.Success then
            Some myMatch.Groups.[1].Value
        else None
    // Gets the value for the closest cssProperty.
    node.Ancestors("span") |> Seq.tryPick predicate

一个函数，用于从节点自身的 style 属性中获取特定的 CSS 属性。

/// Reads `cssProperty` from the node's own inline style and returns its value,
/// or None when the style does not declare that property.
///
/// The property name is matched literally (regex-escaped) and only at the start
/// of a declaration, so a request for "margin-left" cannot be satisfied by a
/// longer property name that merely ends in "margin-left".
let getPartialStyleSelf cssProperty (node: HtmlNode) =
    // Style text looks like "property1:value1;property2:value2".
    // A declaration starts at the beginning of the text or right after ';'
    // (optionally followed by whitespace such as a wrapped line).
    let pattern = "(?:^|;)\\s*" + Regex.Escape cssProperty + ":(.+?)(;|$)"
    let myMatch = Regex.Match(getStyle node, pattern)
    if myMatch.Success then
        Some myMatch.Groups.[1].Value
    else
        None

标题

通过检查段落的 font-size 和 color CSS 属性以及它是否具有 <b> 或 <i> 祖先节点来确定段落的标题类型。

// Picks the heading level from the font size, the colour and whether the node
// has <b> / <i> ancestors; any other combination is a Normal paragraph.
match font, color, node with
| Some "16.0pt", Some "#17365D", (HasAncestor "b" true) -> H 1
| Some "13.0pt", Some "#366092", (HasAncestor "b" true) -> H 2
| Some "11.0pt", Some "#366092", ((HasAncestor "b" bold) & (HasAncestor "i" italic)) ->
    // The four lowest levels share size and colour; only bold/italic tells them apart.
    (match bold, italic with
     | true, false  -> H 3
     | true, true   -> H 4
     | false, false -> H 5
     | false, true  -> H 6)
| _ -> Normal

使用 ## heading ## 语法，以便 Markdown 解析器不会“吃掉”标题中包含的最后一个 #。
```
/// Formats `text` as a closed Markdown heading of level `n`, i.e. `n` '#'
/// characters, a space, the text, a space and `n` '#' characters again.
let headIt n text =
    // One '#' per heading level.
    let hashes = String.replicate n "#"
    String.Format("{1} {0} {1}", text, hashes)
```

代码

任何字体为 Consolas 的文本都被认为是代码，否则不是。

// Text whose inherited font-family is exactly "Consolas" is passed through varIt;
// everything else is returned unchanged.
if getPartialStyle "font-family" textNode = Some "Consolas" then varIt text
else text

简化 Markdown 语法：如果多个内联代码片段之间仅由空格分隔，则将它们合并为一个（例如，`a` `b` -> `a b`）。保留前导空格，以保护缩进以及代码块中的空行（其内部内容并非无关紧要，之后也不会被删除，例如 `  ``a` -> `  a`）。但是，存在一个限制，即文本本身不能包含 `。
```
/// Merges neighbouring inline-code spans: a closing backtick, optional
/// whitespace and an opening backtick collapse to just that whitespace.
/// The look-behind requires a preceding character, so a backtick at the very
/// start of the text is never treated as a closing one.
let simplifyVar (text: string) =
    let adjacentTicks = Regex(@"(?<=.)`(\s*)`")
    adjacentTicks.Replace(text, "$1")
```

区分代码块和内联代码。

/// Returns Some inner text when the whole of `text` is a single
/// backtick-delimited span with no backtick inside it; otherwise None.
let tryGetPureCode (text: string) =
    match Regex.Match(text, @"^`([^`]*)`$") with
    | m when m.Success -> Some m.Groups.[1].Value
    | _ -> None

列表

通过符号区分有序列表和无序列表。没有符号的列表被认为是没有任何缩进的普通段落。

/// Prefixes `text` with a Markdown list marker chosen from the bullet symbol `x`:
/// "o" and "·" give an unordered item ("*  "), any other symbol an ordered one ("1. ").
let listIt x text =
    if x = "o" || x = "·" then sprintf "*  %s" text
    else sprintf "1. %s" text

通过 margin-left CSS 属性（例如 margin-left:54.0pt）获取缩进。

/// Converts the node's own margin-left value (e.g. "54.0pt") into an indent level.
/// The part before the first '.' is parsed as an integer and divided (integer
/// division) by 27. Returns 0 when the node has no margin-left; fails with
/// "indent parse error!" when that leading part is not an integer.
let getIndent (node: HtmlNode) =
    let pointsPerLevel = 27 // each level is 27
    let toLevel (margin: string) =
        // Only the whole-number part of the margin is used.
        let wholePart = (margin.Split '.').[0]
        match Int32.TryParse wholePart with
        | true, points -> points / pointsPerLevel
        | false, _ -> failwith "indent parse error!"
    match getPartialStyleSelf "margin-left" node with
    | None -> 0
    | Some margin -> toLevel margin

链接

通过查找其祖先中的 <a> 来检查一段文本是否包含链接。

// Text without an <a> ancestor is returned as is; otherwise it is handed to linkIt
// together with the href of the first <a> ancestor ("none" is the fallback value
// passed to GetAttributeValue).
match textNode with
| (HasAncestor "a" false) -> text
| _ ->
    let href = (Seq.head (textNode.Ancestors("a"))).GetAttributeValue("href", "none")
    linkIt text href

最终化

获取整个内容的正确缩进和段落间距。

/// Turns the classified paragraphs into output strings, choosing each one's
/// indentation and surrounding line breaks from its own kind and the kind of
/// the paragraph before it.
/// Assumes in OneNote there are no spaces in front of a code block (indent by tabs).
/// Assumes in OneNote the internal indentations of a code block are either all tabs or all spaces, never mixed.
/// Slot 0 of the result is never written (it keeps the Array.zeroCreate default),
/// since the first paragraph is expected to be the title.
let review paragraphs =
    let source = Seq.toArray paragraphs
    let rendered = Array.zeroCreate source.Length
    // The shifts support nested indentation. When a reference line whose raw
    // indentation is a must be printed at level x, the shift is a - x; a later
    // line with raw indentation b is then printed at b - shift = b - a + x.
    let mutable listShift = 0
    let mutable codeShift = 0
    // Walks the paragraphs pairwise (previous, current), starting at the second one.
    // "\r\n" is used so that Notepad reads the line breaks correctly.
    for idx = 1 to source.Length - 1 do
        let previous = source.[idx - 1]
        rendered.[idx] <-
            match source.[idx] with
            | Heading text | Basic text ->
                match previous with
                | Code _ | Listing _ ->
                    // A code or list block just ended: forget both shifts and
                    // surround the paragraph with new lines.
                    listShift <- 0
                    codeShift <- 0
                    sprintf "\r\n%s\r\n" text
                | Heading _ | Basic _ ->
                    // Plain text after plain text: only a trailing new line.
                    sprintf "%s\r\n" text
            | Code (text, depth) ->
                match previous with
                | Code _ ->
                    // Inside a code block: no new line between its lines.
                    indentIt (depth - codeShift) text
                | Heading _ | Basic _ ->
                    // A code block opens after unindented text, so it sits at
                    // level 1; remember the shift for the lines that follow.
                    codeShift <- depth - 1
                    indentIt 1 text
                | Listing _ ->
                    // A code block opens inside a list: one level deeper than the
                    // list indentation, preceded by a new line; remember the shift.
                    codeShift <- listShift - 1
                    sprintf "\r\n%s" (indentIt (depth - listShift + 1) text)
            | Listing (text, depth) ->
                match previous with
                | Listing _ ->
                    // Inside a list block: no new line between items.
                    indentIt (depth - listShift) text
                | Code _ ->
                    // The code block ended and the list goes on: drop the code
                    // shift and put a new line in front of the item.
                    codeShift <- 0
                    sprintf "\r\n%s" (indentIt (depth - listShift) text)
                | Heading _ | Basic _ ->
                    // A list block opens: its first item defines the list shift
                    // and is emitted without indentation.
                    listShift <- depth
                    text
    rendered