diff --git a/README.md b/README.md index 06859e1..67891b0 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ go get github.com/JohannesKaufmann/html-to-markdown ## Usage ```go -import "github.com/JohannesKaufmann/html-to-markdown" +import md "github.com/JohannesKaufmann/html-to-markdown" converter := md.NewConverter("", true, nil) diff --git a/commonmark.go b/commonmark.go index 8c4a2d1..70a3f3e 100644 --- a/commonmark.go +++ b/commonmark.go @@ -16,7 +16,7 @@ import ( var multipleSpacesR = regexp.MustCompile(` +`) var commonmark = []Rule{ - Rule{ + { Filter: []string{"ul", "ol"}, Replacement: func(content string, selec *goquery.Selection, opt *Options) *string { parent := selec.Parent() @@ -42,7 +42,7 @@ var commonmark = []Rule{ return &content }, }, - Rule{ + { Filter: []string{"li"}, Replacement: func(content string, selec *goquery.Selection, opt *Options) *string { if strings.TrimSpace(content) == "" { @@ -68,7 +68,7 @@ var commonmark = []Rule{ return String(prefix + content + "\n") }, }, - Rule{ + { Filter: []string{"#text"}, Replacement: func(content string, selec *goquery.Selection, opt *Options) *string { text := selec.Text() @@ -85,7 +85,7 @@ var commonmark = []Rule{ return &text }, }, - Rule{ + { Filter: []string{"p", "div"}, Replacement: func(content string, selec *goquery.Selection, opt *Options) *string { parent := goquery.NodeName(selec.Parent()) @@ -101,7 +101,7 @@ var commonmark = []Rule{ return &content }, }, - Rule{ + { Filter: []string{"h1", "h2", "h3", "h4", "h5", "h6"}, Replacement: func(content string, selec *goquery.Selection, opt *Options) *string { if strings.TrimSpace(content) == "" { @@ -141,7 +141,7 @@ var commonmark = []Rule{ return &text }, }, - Rule{ + { Filter: []string{"strong", "b"}, Replacement: func(content string, selec *goquery.Selection, opt *Options) *string { // only use one bold tag if they are nested @@ -162,7 +162,7 @@ var commonmark = []Rule{ return &trimmed }, }, - Rule{ + { Filter: []string{"i", "em"}, 
Replacement: func(content string, selec *goquery.Selection, opt *Options) *string { // only use one italic tag if they are nested @@ -183,7 +183,7 @@ var commonmark = []Rule{ return &trimmed }, }, - Rule{ + { Filter: []string{"img"}, Replacement: func(content string, selec *goquery.Selection, opt *Options) *string { alt := selec.AttrOr("alt", "") @@ -209,7 +209,7 @@ var commonmark = []Rule{ return &text }, }, - Rule{ + { Filter: []string{"a"}, AdvancedReplacement: func(content string, selec *goquery.Selection, opt *Options) (AdvancedResult, bool) { // if there is no href, no link is used. So just return the content inside the link @@ -270,7 +270,7 @@ var commonmark = []Rule{ return AdvancedResult{Markdown: replacement, Footer: reference}, false }, }, - Rule{ + { Filter: []string{"code"}, Replacement: func(_ string, selec *goquery.Selection, opt *Options) *string { content := selec.Text() @@ -280,7 +280,7 @@ var commonmark = []Rule{ return &text }, }, - Rule{ + { Filter: []string{"pre"}, Replacement: func(content string, selec *goquery.Selection, opt *Options) *string { codeElement := selec.Find("code") @@ -301,20 +301,20 @@ var commonmark = []Rule{ return &text }, }, - Rule{ + { Filter: []string{"hr"}, Replacement: func(content string, selec *goquery.Selection, opt *Options) *string { text := "\n\n" + opt.HorizontalRule + "\n\n" return &text }, }, - Rule{ + { Filter: []string{"br"}, Replacement: func(content string, selec *goquery.Selection, opt *Options) *string { return String("\n\n") }, }, - Rule{ + { Filter: []string{"blockquote"}, Replacement: func(content string, selec *goquery.Selection, opt *Options) *string { content = strings.TrimSpace(content) @@ -331,7 +331,7 @@ var commonmark = []Rule{ return &text }, }, - Rule{ + { Filter: []string{"noscript"}, Replacement: func(content string, selec *goquery.Selection, opt *Options) *string { // for now remove the contents of noscript. 
But in the future we could diff --git a/examples/add_rules/main.go b/examples/add_rules/main.go index 8f63447..9078e78 100644 --- a/examples/add_rules/main.go +++ b/examples/add_rules/main.go @@ -10,8 +10,8 @@ import ( ) func main() { - html := `Good sountrack and cake.` - // -> `Good sountrack ~and cake~.` + html := `Good soundtrack and cake.` + // -> `Good soundtrack ~~and cake~~.` /* We want to add a rule when a `span` tag has a class of `bb_strike`. @@ -31,7 +31,7 @@ func main() { // Because of the space it is not recognized as strikethrough. // -> trim spaces at begin&end of string when inside strong/italic/... content = strings.TrimSpace(content) - return md.String("~" + content + "~") + return md.String("~~" + content + "~~") }, } diff --git a/from.go b/from.go index 31c7a30..556f11a 100644 --- a/from.go +++ b/from.go @@ -237,7 +237,7 @@ type Plugin func(conv *Converter) []Rule func (c *Converter) Use(plugins ...Plugin) *Converter { for _, plugin := range plugins { rules := plugin(c) - c.AddRules(rules...) // TODO: for better perfomance only use one lock for all plugins + c.AddRules(rules...) // TODO: for better performance only use one lock for all plugins } return c } diff --git a/markdown.go b/markdown.go index 8c1789d..c2498f3 100644 --- a/markdown.go +++ b/markdown.go @@ -65,6 +65,9 @@ var inlineElements = []string{ // -> https://developer.mozilla.org/de/docs/Web/H "button", "input", "label", "select", "textarea", } +// IsInlineElement can be used to check whether a node name (goquery.NodeName) is +// an html inline element and not a block element. Used in the rule for the +// p tag to check whether the text is inside a block element. func IsInlineElement(e string) bool { for _, element := range inlineElements { if element == e { @@ -132,6 +135,8 @@ type Options struct { domain string } +// AdvancedResult is used for example for links. If you use LinkStyle:referenced +// the link href is placed at the bottom of the generated markdown (Footer). 
type AdvancedResult struct { Header string Markdown string diff --git a/plugin/confluence_code_block.go b/plugin/confluence_code_block.go index d0d9103..78daea3 100644 --- a/plugin/confluence_code_block.go +++ b/plugin/confluence_code_block.go @@ -14,7 +14,7 @@ func ConfluenceCodeBlock() md.Plugin { return func(c *md.Converter) []md.Rule { character := "```" return []md.Rule{ - md.Rule{ + { Filter: []string{"ac:structured-macro"}, Replacement: func(content string, selec *goquery.Selection, opt *md.Options) *string { for _, node := range selec.Nodes { diff --git a/plugin/table.go b/plugin/table.go index a743394..467629e 100644 --- a/plugin/table.go +++ b/plugin/table.go @@ -10,13 +10,13 @@ import ( // EXPERIMENTAL_Table converts a html table to markdown. var EXPERIMENTAL_Table = []md.Rule{ - md.Rule{ // TableCell + { // TableCell Filter: []string{"th", "td"}, Replacement: func(content string, selec *goquery.Selection, opt *md.Options) *string { return md.String(cell(content, selec)) }, }, - md.Rule{ // TableRow + { // TableRow Filter: []string{"tr"}, Replacement: func(content string, selec *goquery.Selection, opt *md.Options) *string { borderCells := "" diff --git a/plugin/task_list.go b/plugin/task_list.go index 990960c..c06617f 100644 --- a/plugin/task_list.go +++ b/plugin/task_list.go @@ -9,7 +9,7 @@ import ( func TaskListItems() md.Plugin { return func(c *md.Converter) []md.Rule { return []md.Rule{ - md.Rule{ + { Filter: []string{"input"}, Replacement: func(content string, selec *goquery.Selection, opt *md.Options) *string { if !selec.Parent().Is("li") { diff --git a/plugin/vimeo.go b/plugin/vimeo.go index d84fd19..f916ae4 100644 --- a/plugin/vimeo.go +++ b/plugin/vimeo.go @@ -13,6 +13,7 @@ import ( "github.com/PuerkitoBio/goquery" ) +// Timeout for the http client var Timeout = time.Second * 10 var netClient = &http.Client{ Timeout: Timeout, @@ -46,12 +47,15 @@ var vimeoID = regexp.MustCompile(`video\/(\d*)`) type vimeoVariation int +// Configure how the Vimeo 
Plugin should display the video in markdown. const ( VimeoOnlyThumbnail vimeoVariation = iota VimeoWithTitle VimeoWithDescription ) +// EXPERIMENTAL_VimeoEmbed registers a rule (for iframes) and +// returns a markdown compatible representation (link to video, ...). func EXPERIMENTAL_VimeoEmbed(variation vimeoVariation) md.Plugin { return func(c *md.Converter) []md.Rule { getVimeoData := func(id string) (*vimeoVideo, error) { @@ -89,7 +93,7 @@ func EXPERIMENTAL_VimeoEmbed(variation vimeoVariation) md.Plugin { } return []md.Rule{ - md.Rule{ + { Filter: []string{"iframe"}, Replacement: func(content string, selec *goquery.Selection, opt *md.Options) *string { src := selec.AttrOr("src", "") diff --git a/plugin/youtube.go b/plugin/youtube.go index 35cf218..d8c4ae9 100644 --- a/plugin/youtube.go +++ b/plugin/youtube.go @@ -11,8 +11,10 @@ import ( var youtubeID = regexp.MustCompile(`youtube\.com\/embed\/([^\&\?\/]+)`) +// EXPERIMENTAL_YoutubeEmbed registers a rule (for iframes) and +// returns a markdown compatible representation (link to video, ...). var EXPERIMENTAL_YoutubeEmbed = []md.Rule{ - md.Rule{ + { Filter: []string{"iframe"}, Replacement: func(content string, selec *goquery.Selection, opt *md.Options) *string { src := selec.AttrOr("src", "") diff --git a/utils.go b/utils.go index 857c114..3b551d1 100644 --- a/utils.go +++ b/utils.go @@ -11,6 +11,12 @@ import ( "golang.org/x/net/html" ) +/* +WARNING: The functions from this file can be used externally +but there is no guarantee that they will stay exported. +*/ + +// CollectText returns the text of the node and all its children func CollectText(n *html.Node) string { text := &bytes.Buffer{} collectText(n, text) @@ -26,7 +32,8 @@ func collectText(n *html.Node, buf *bytes.Buffer) { } } -// always have a space to the side to recognize the delimiter +// AddSpaceIfNessesary adds spaces to the text based on the neighbors. +// That makes sure that there is always a space to the side, to recognize the delimiter. 
func AddSpaceIfNessesary(selec *goquery.Selection, text string) string { var prev string @@ -92,6 +99,8 @@ func AddSpaceIfNessesary(selec *goquery.Selection, text string) string { return text } +// TrimpLeadingSpaces removes spaces from the beginning of a line +// but makes sure that list items and code blocks are not affected. func TrimpLeadingSpaces(text string) string { parts := strings.Split(text, "\n") for i := range parts { @@ -128,6 +137,7 @@ func TrimpLeadingSpaces(text string) string { return strings.Join(parts, "\n") } +// TrimTrailingSpaces removes unnecessary spaces from the end of lines. func TrimTrailingSpaces(text string) string { parts := strings.Split(text, "\n") for i := range parts { @@ -143,6 +153,7 @@ func TrimTrailingSpaces(text string) string { // The same as `multipleNewLinesRegex`, but applies to escaped new lines inside a link `\n\` var multipleNewLinesInLinkRegex = regexp.MustCompile(`(\n\\){1,}`) // `([\n\r\s]\\)` +// EscapeMultiLine deals with multiline content inside a link func EscapeMultiLine(content string) string { content = strings.TrimSpace(content) content = strings.Replace(content, "\n", `\`+"\n", -1) @@ -152,7 +163,7 @@ func EscapeMultiLine(content string) string { return content } -// Cal can be passed the content of a code block and it returns +// CalculateCodeFence can be passed the content of a code block and it returns // how many fence characters (` or ~) should be used. // // This is useful if the html content includes the same fence characters