djot-0.1.2.4/0000755000000000000000000000000007346545000011005 5ustar0000000000000000djot-0.1.2.4/CHANGELOG.md0000644000000000000000000000314507346545000012621 0ustar0000000000000000# Revision history for djot ## 0.1.2.4 -- 2025-11-30 * Ensure that `'95--'96` doesn't get parsed as singlequoted. * Properly handle bare `'}` for right single-quote (#12). ## 0.1.2.3 -- 2025-09-27 * Fix swallowing of indentation in code under blockquote (#11). ## 0.1.2.2 -- 2024-10-04 * Allow list items with blank lines between divs (#10). * Fix parsing of indented tables (#8). ## 0.1.2.1 -- 2024-06-24 * Djot writer: include separator line in table when the table has non-default alignments but no header (#7). ## 0.1.2 -- 2024-05-10 * Allow `_` in symbols (see jgm/djot#296). * Add Lift derivations to AST datatypes (#5, Gideon Farrell) [API change]. ## 0.1.1.3 -- 2024-03-17 * Ensure that tables end when we hit a blank line (#4). * Fix parsing of table immediately after list (#4). ## 0.1.1.2 -- 2024-03-14 * Fix bug parsing regular paragraphs after list (#4). ## 0.1.1.1 -- 2024-03-03 * Revert "Djot.Blocks: use ByteString directly in `toIdentifier` (#1)" This caused problems for UTF-8 sequences that contained the byte 0xa0, which B8.words treats as a space character. * AST: avoid using B8.words in normalizeLabel. * Avoid using isSpace in attribute parsing. isSpace matches a byte 0x0a, which can break up a UTF-8 sequence. Limit to ASCII whitespace. * Add test with UTF-8 identifier. See jgm/pandoc#9541. ## 0.1.1.0 -- 2024-02-29 * Add Data instances to everything in the AST [API change]. * Ensure that block attributes are indented on subsequent lines (#2). * Djot.Blocks: use ByteString directly in `toIdentifier` (#1, Vaibhav Sagar). ## 0.1.0.0 -- 2024-02-14 * Initial release. 
djot-0.1.2.4/LICENSE0000644000000000000000000000204307346545000012011 0ustar0000000000000000Copyright (c) 2023 John MacFarlane Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
djot-0.1.2.4/app/0000755000000000000000000000000007346545000011565 5ustar0000000000000000djot-0.1.2.4/app/Main.hs0000644000000000000000000000666107346545000013016 0ustar0000000000000000{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE Strict #-} module Main where import qualified Data.ByteString as B import Data.ByteString.Builder (hPutBuilder) import Djot ( ParseOptions(..), RenderOptions(..), SourcePosOption(..), parseDoc, renderHtml, renderDjot ) import System.Environment (getArgs) import System.IO (stderr, stdout, hPutStrLn) import System.Exit ( ExitCode(ExitFailure), exitWith, exitSuccess ) import Text.DocLayout (render) import Text.Read (readMaybe) import qualified Data.Text.IO as TIO data OutputFormat = Html | Djot | Ast deriving (Eq, Show) data WrapOption = Auto | Preserve | NoWrap deriving (Eq, Show) data Opts = Opts{ format :: OutputFormat , files :: [FilePath] , wrap :: WrapOption , columns :: Int , sourcePos :: SourcePosOption } parseOpts :: [String] -> IO Opts parseOpts = go Opts{ format = Html, files = [], wrap = Preserve, columns = 72, sourcePos = NoSourcePos } where go opts [] = pure opts go opts ("--wrap" : as) = case as of "auto" : as' -> go opts{ wrap = Auto } as' "preserve" : as' -> go opts{ wrap = Preserve } as' "none" : as' -> go opts{ wrap = NoWrap } as' _ -> err "--wrap must be followed by auto, preserve, or none" go opts ("--columns" : as) = case as of (a:as') | Just n <- readMaybe a -> go opts{ columns = n } as' _ -> err "--columns must be followed by a number" go opts ("--to" : as) = case as of "djot" : as' -> go opts{ format = Djot } as' "html" : as' -> go opts{ format = Html } as' "ast" : as' -> go opts{ format = Ast } as' _ -> err "--to must be followed by djot, html, or ast" go opts ("--sourcepos" : as) = case as of ("none":as') -> go opts{ sourcePos = NoSourcePos } as' ("block":as') -> go opts{ sourcePos = BlockSourcePos } as' ("all":as') -> go opts{ sourcePos = AllSourcePos } as' _ -> err "--sourcepos takes an argument 
(none|block|all)" go _opts ("--help" : _) = do putStrLn "djoths [options] [files]" putStrLn " --to djot|html*|ast" putStrLn " --wrap auto|preserve*|none" putStrLn " --columns NUMBER" putStrLn " --sourcepos none*|block|all" putStrLn " --help" exitSuccess go opts (xs@('-':_) : as) = case break (== '=') xs of -- support e.g. '--columns=33' (ys, '=':zs) -> go opts (ys : zs : as) _ -> err $ "Unknown option " <> ('-':xs) go opts (f : as) = go opts{ files = files opts ++ [f] } as err :: String -> IO a err msg = do hPutStrLn stderr msg exitWith $ ExitFailure 1 main :: IO () main = do opts <- getArgs >>= parseOpts bs <- case files opts of [] -> B.getContents fs -> mconcat <$> mapM B.readFile fs let popts = ParseOptions { sourcePositions = sourcePos opts } let ropts = RenderOptions { preserveSoftBreaks = wrap opts == Preserve } case parseDoc popts bs of Right doc -> do case format opts of Html -> hPutBuilder stdout $ renderHtml ropts doc Djot -> TIO.putStr $ render (case wrap opts of NoWrap -> Nothing Preserve -> Nothing Auto -> Just (columns opts)) $ renderDjot ropts doc Ast -> print doc exitSuccess Left e -> do hPutStrLn stderr e exitWith $ ExitFailure 1 djot-0.1.2.4/benchmark/0000755000000000000000000000000007346545000012737 5ustar0000000000000000djot-0.1.2.4/benchmark/Main.hs0000644000000000000000000000341607346545000014163 0ustar0000000000000000{-# LANGUAGE CPP #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE Strict #-} import Test.Tasty.Bench import Data.Functor.Identity -- base >= 4.8 import qualified Data.ByteString as B import Djot ( ParseOptions(..), RenderOptions(..), SourcePosOption(..), parseDoc, renderHtml, renderDjot ) import Data.ByteString.Builder ( toLazyByteString ) import Text.DocLayout (render) import System.Directory import System.FilePath (takeExtension, ()) import qualified Data.ByteString.Lazy as BL main :: IO () main = do fns <- filter ((== ".dj") . 
takeExtension) <$> listDirectory "benchmark" files <- mapM (\fn -> (fn,) <$> B.readFile ("benchmark" fn)) fns defaultMain $ map (\(fn, bs) -> bench ("parse " <> fn) $ whnf (parseDoc ParseOptions{ sourcePositions = NoSourcePos }) bs) files ++ map (\(fn, bs) -> bench ("parse w/ block source positions only " <> fn) $ whnf (parseDoc ParseOptions{ sourcePositions = BlockSourcePos }) bs) files ++ map (\(fn, bs) -> bench ("parse w/ source positions " <> fn) $ whnf (parseDoc ParseOptions{ sourcePositions = AllSourcePos }) bs) files ++ map (\(fn, bs) -> let doc = either error id $ parseDoc ParseOptions{ sourcePositions = NoSourcePos } bs in bench ("renderHtml " <> fn) $ nf (BL.toStrict . toLazyByteString . renderHtml RenderOptions{preserveSoftBreaks = True}) doc) files ++ map (\(fn, bs) -> let doc = either error id $ parseDoc ParseOptions{ sourcePositions = NoSourcePos } bs in bench ("renderDjot " <> fn) $ nf (render (Just 72) . renderDjot RenderOptions{preserveSoftBreaks = True}) doc) files djot-0.1.2.4/benchmark/m.dj0000644000000000000000000074726607346545000013540 0ustar0000000000000000{#synopsis} # Synopsis `pandoc` \[_options_\] \[_input-file_\]… {#description} # Description Pandoc is a [Haskell](https://www.haskell.org) library for converting from one markup format to another, and a command-line tool that uses this library. Pandoc can convert between numerous markup and word processing formats, including, but not limited to, various flavors of [Markdown](https://daringfireball.net/projects/markdown/), [HTML](https://www.w3.org/html/), [LaTeX](https://www.latex-project.org/) and [Word docx](https://en.wikipedia.org/wiki/Office_Open_XML). For the full lists of input and output formats, see the `--from` and `--to` [options below](#general-options). Pandoc can also produce [PDF](https://www.adobe.com/pdf/) output: see [creating a PDF](#creating-a-pdf), below. 
Pandoc’s enhanced version of Markdown includes syntax for [tables](#tables), [definition lists](#definition-lists), [metadata blocks](#metadata-blocks), [footnotes](#footnotes), [citations](#citations), [math](#math), and much more. See below under [Pandoc’s Markdown](#pandocs-markdown). Pandoc has a modular design: it consists of a set of readers, which parse text in a given format and produce a native representation of the document (an _abstract syntax tree_ or AST), and a set of writers, which convert this native representation into a target format. Thus, adding an input or output format requires only adding a reader or writer. Users can also run custom [pandoc filters](https://pandoc.org/filters.html) to modify the intermediate AST. Because pandoc’s intermediate representation of a document is less expressive than many of the formats it converts between, one should not expect perfect conversions between every format and every other. Pandoc attempts to preserve the structural elements of a document, but not formatting details such as margin size. And some document elements, such as complex tables, may not fit into pandoc’s simple document model. While conversions from pandoc’s Markdown to all formats aspire to be perfect, conversions from formats more expressive than pandoc’s Markdown can be expected to be lossy. {#using-pandoc} ## Using pandoc If no _input-files_ are specified, input is read from _stdin_. Output goes to _stdout_ by default. For output to a file, use the `-o` option: ``` pandoc -o output.html input.txt ``` By default, pandoc produces a document fragment. To produce a standalone document (e.g. a valid HTML file including `` and ``), use the `-s` or `--standalone` flag: ``` pandoc -s -o output.html input.txt ``` For more information on how standalone documents are produced, see [Templates](#templates) below. If multiple input files are given, pandoc will concatenate them all (with blank lines between them) before parsing. 
(Use `--file-scope` to parse files individually.) {#specifying-formats} ## Specifying formats The format of the input and output can be specified explicitly using command-line options. The input format can be specified using the `-f/--from` option, the output format using the `-t/--to` option. Thus, to convert `hello.txt` from Markdown to LaTeX, you could type: ``` pandoc -f markdown -t latex hello.txt ``` To convert `hello.html` from HTML to Markdown: ``` pandoc -f html -t markdown hello.html ``` Supported input and output formats are listed below under [Options](#options) (see `-f` for input formats and `-t` for output formats). You can also use `pandoc --list-input-formats` and `pandoc --list-output-formats` to print lists of supported formats. If the input or output format is not specified explicitly, pandoc will attempt to guess it from the extensions of the filenames. Thus, for example, ``` pandoc -o hello.tex hello.txt ``` will convert `hello.txt` from Markdown to LaTeX. If no output file is specified (so that output goes to _stdout_), or if the output file’s extension is unknown, the output format will default to HTML. If no input file is specified (so that input comes from _stdin_), or if the input files’ extensions are unknown, the input format will be assumed to be Markdown. {#character-encoding} ## Character encoding Pandoc uses the UTF-8 character encoding for both input and output. If your local character encoding is not UTF-8, you should pipe input and output through [`iconv`](https://www.gnu.org/software/libiconv/): ``` iconv -t utf-8 input.txt | pandoc | iconv -f utf-8 ``` Note that in some output formats (such as HTML, LaTeX, ConTeXt, RTF, OPML, DocBook, and Texinfo), information about the character encoding is included in the document header, which will only be included if you use the `-s/--standalone` option. 
{#creating-a-pdf} ## Creating a PDF To produce a PDF, specify an output file with a `.pdf` extension: ``` pandoc test.txt -o test.pdf ``` By default, pandoc will use LaTeX to create the PDF, which requires that a LaTeX engine be installed (see `--pdf-engine` below). Alternatively, pandoc can use ConTeXt, roff ms, or HTML as an intermediate format. To do this, specify an output file with a `.pdf` extension, as before, but add the `--pdf-engine` option or `-t context`, `-t html`, or `-t ms` to the command line. The tool used to generate the PDF from the intermediate format may be specified using `--pdf-engine`. You can control the PDF style using variables, depending on the intermediate format used: see [variables for LaTeX](#variables-for-latex), [variables for ConTeXt](#variables-for-context), [variables for `wkhtmltopdf`](#variables-for-wkhtmltopdf), [variables for ms](#variables-for-ms). When HTML is used as an intermediate format, the output can be styled using `--css`. To debug the PDF creation, it can be useful to look at the intermediate representation: instead of `-o test.pdf`, use for example `-s -o test.tex` to output the generated LaTeX. You can then test it with `pdflatex test.tex`. 
When using LaTeX, the following packages need to be available (they are included with all recent versions of [TeX Live](https://www.tug.org/texlive/)): [`amsfonts`](https://ctan.org/pkg/amsfonts), [`amsmath`](https://ctan.org/pkg/amsmath), [`lm`](https://ctan.org/pkg/lm), [`unicode-math`](https://ctan.org/pkg/unicode-math), [`iftex`](https://ctan.org/pkg/iftex), [`listings`](https://ctan.org/pkg/listings) (if the `--listings` option is used), [`fancyvrb`](https://ctan.org/pkg/fancyvrb), [`longtable`](https://ctan.org/pkg/longtable), [`booktabs`](https://ctan.org/pkg/booktabs), [`graphicx`](https://ctan.org/pkg/graphicx) (if the document contains images), [`hyperref`](https://ctan.org/pkg/hyperref), [`xcolor`](https://ctan.org/pkg/xcolor), [`soul`](https://ctan.org/pkg/soul), [`geometry`](https://ctan.org/pkg/geometry) (with the `geometry` variable set), [`setspace`](https://ctan.org/pkg/setspace) (with `linestretch`), and [`babel`](https://ctan.org/pkg/babel) (with `lang`). If `CJKmainfont` is set, [`xeCJK`](https://ctan.org/pkg/xecjk) is needed. The use of `xelatex` or `lualatex` as the PDF engine requires [`fontspec`](https://ctan.org/pkg/fontspec). `lualatex` uses [`selnolig`](https://ctan.org/pkg/selnolig). `xelatex` uses [`bidi`](https://ctan.org/pkg/bidi) (with the `dir` variable set). If the `mathspec` variable is set, `xelatex` will use [`mathspec`](https://ctan.org/pkg/mathspec) instead of [`unicode-math`](https://ctan.org/pkg/unicode-math). The [`upquote`](https://ctan.org/pkg/upquote) and [`microtype`](https://ctan.org/pkg/microtype) packages are used if available, and [`csquotes`](https://ctan.org/pkg/csquotes) will be used for [typography](#typography) if the `csquotes` variable or metadata field is set to a true value. 
The [`natbib`](https://ctan.org/pkg/natbib), [`biblatex`](https://ctan.org/pkg/biblatex), [`bibtex`](https://ctan.org/pkg/bibtex), and [`biber`](https://ctan.org/pkg/biber) packages can optionally be used for [citation rendering](#citation-rendering). The following packages will be used to improve output quality if present, but pandoc does not require them to be present: [`upquote`](https://ctan.org/pkg/upquote) (for straight quotes in verbatim environments), [`microtype`](https://ctan.org/pkg/microtype) (for better spacing adjustments), [`parskip`](https://ctan.org/pkg/parskip) (for better inter-paragraph spaces), [`xurl`](https://ctan.org/pkg/xurl) (for better line breaks in URLs), [`bookmark`](https://ctan.org/pkg/bookmark) (for better PDF bookmarks), and [`footnotehyper`](https://ctan.org/pkg/footnotehyper) or [`footnote`](https://ctan.org/pkg/footnote) (to allow footnotes in tables). {#reading-from-the-web} ## Reading from the Web Instead of an input file, an absolute URI may be given. In this case pandoc will fetch the content using HTTP: ``` pandoc -f html -t markdown https://www.fsf.org ``` It is possible to supply a custom User-Agent string or other header when requesting a document from a URL: ``` pandoc -f html -t markdown --request-header User-Agent:"Mozilla/5.0" \ https://www.fsf.org ``` {#options} # Options {#general-options .options} ## General options : `-f` _FORMAT_, `-r` _FORMAT_, `--from=`_FORMAT_, `--read=`_FORMAT_ Specify input format. 
_FORMAT_ can be: {#input-formats} ::: - `bibtex` ([BibTeX](https://ctan.org/pkg/bibtex) bibliography) - `biblatex` ([BibLaTeX](https://ctan.org/pkg/biblatex) bibliography) - `commonmark` ([CommonMark](https://commonmark.org) Markdown) - `commonmark_x` ([CommonMark](https://commonmark.org) Markdown with extensions) - `creole` ([Creole 1.0](http://www.wikicreole.org/wiki/Creole1.0)) - `csljson` ([CSL JSON](https://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html) bibliography) - `csv` ([CSV](https://tools.ietf.org/html/rfc4180) table) - `tsv` ([TSV](https://www.iana.org/assignments/media-types/text/tab-separated-values) table) - `docbook` ([DocBook](https://docbook.org)) - `docx` ([Word docx](https://en.wikipedia.org/wiki/Office_Open_XML)) - `dokuwiki` ([DokuWiki markup](https://www.dokuwiki.org/dokuwiki)) - `endnotexml` ([EndNote XML bibliography](https://support.clarivate.com/Endnote/s/article/EndNote-XML-Document-Type-Definition)) - `epub` ([EPUB](http://idpf.org/epub)) - `fb2` ([FictionBook2](http://www.fictionbook.org/index.php/Eng:XML_Schema_Fictionbook_2.1) e-book) - `gfm` ([GitHub-Flavored Markdown](https://help.github.com/articles/github-flavored-markdown/)), or the deprecated and less accurate `markdown_github`; use [`markdown_github`](#markdown-variants) only if you need extensions not supported in [`gfm`](#markdown-variants). 
- `haddock` ([Haddock markup](https://www.haskell.org/haddock/doc/html/ch03s08.html)) - `html` ([HTML](https://www.w3.org/html/)) - `ipynb` ([Jupyter notebook](https://nbformat.readthedocs.io/en/latest/)) - `jats` ([JATS](https://jats.nlm.nih.gov) XML) - `jira` ([Jira](https://jira.atlassian.com/secure/WikiRendererHelpAction.jspa?section=all)/Confluence wiki markup) - `json` (JSON version of native AST) - `latex` ([LaTeX](https://www.latex-project.org/)) - `markdown` ([Pandoc’s Markdown](#pandocs-markdown)) - `markdown_mmd` ([MultiMarkdown](https://fletcherpenney.net/multimarkdown/)) - `markdown_phpextra` ([PHP Markdown Extra](https://michelf.ca/projects/php-markdown/extra/)) - `markdown_strict` (original unextended [Markdown](https://daringfireball.net/projects/markdown/)) - `mediawiki` ([MediaWiki markup](https://www.mediawiki.org/wiki/Help:Formatting)) - `man` ([roff man](https://man.cx/groff_man(7))) - `muse` ([Muse](https://amusewiki.org/library/manual)) - `native` (native Haskell) - `odt` ([ODT](https://en.wikipedia.org/wiki/OpenDocument)) - `opml` ([OPML](http://dev.opml.org/spec2.html)) - `org` ([Emacs Org mode](https://orgmode.org)) - `ris` ([RIS](https://en.wikipedia.org/wiki/RIS_(file_format)) bibliography) - `rtf` ([Rich Text Format](https://en.wikipedia.org/wiki/Rich_Text_Format)) - `rst` ([reStructuredText](https://docutils.sourceforge.io/docs/ref/rst/introduction.html)) - `t2t` ([txt2tags](https://txt2tags.org)) - `textile` ([Textile](https://textile-lang.com)) - `tikiwiki` ([TikiWiki markup](https://doc.tiki.org/Wiki-Syntax-Text#The_Markup_Language_Wiki-Syntax)) - `twiki` ([TWiki markup](https://twiki.org/cgi-bin/view/TWiki/TextFormattingRules)) - `typst` ([typst](https://typst.app)) - `vimwiki` ([Vimwiki](https://vimwiki.github.io)) - the path of a custom Lua reader, see [Custom readers and writers](#custom-readers-and-writers) below ::: Extensions can be individually enabled or disabled by appending `+EXTENSION` or `-EXTENSION` to the format name. 
See [Extensions](#extensions) below, for a list of extensions and their names. See `--list-input-formats` and `--list-extensions`, below. : `-t` _FORMAT_, `-w` _FORMAT_, `--to=`_FORMAT_, `--write=`_FORMAT_ Specify output format. _FORMAT_ can be: {#output-formats} ::: - `asciidoc` (modern [AsciiDoc](https://www.methods.co.nz/asciidoc/) as interpreted by [AsciiDoctor](https://asciidoctor.org/)) - `asciidoc_legacy` ([AsciiDoc](https://www.methods.co.nz/asciidoc/) as interpreted by [`asciidoc-py`](https://github.com/asciidoc-py/asciidoc-py)). - `asciidoctor` (deprecated synonym for `asciidoc`) - `beamer` ([LaTeX beamer](https://ctan.org/pkg/beamer) slide show) - `bibtex` ([BibTeX](https://ctan.org/pkg/bibtex) bibliography) - `biblatex` ([BibLaTeX](https://ctan.org/pkg/biblatex) bibliography) - `chunkedhtml` (zip archive of multiple linked HTML files) - `commonmark` ([CommonMark](https://commonmark.org) Markdown) - `commonmark_x` ([CommonMark](https://commonmark.org) Markdown with extensions) - `context` ([ConTeXt](https://www.contextgarden.net/)) - `csljson` ([CSL JSON](https://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html) bibliography) - `docbook` or `docbook4` ([DocBook](https://docbook.org) 4) - `docbook5` (DocBook 5) - `docx` ([Word docx](https://en.wikipedia.org/wiki/Office_Open_XML)) - `dokuwiki` ([DokuWiki markup](https://www.dokuwiki.org/dokuwiki)) - `epub` or `epub3` ([EPUB](http://idpf.org/epub) v3 book) - `epub2` (EPUB v2) - `fb2` ([FictionBook2](http://www.fictionbook.org/index.php/Eng:XML_Schema_Fictionbook_2.1) e-book) - `gfm` ([GitHub-Flavored Markdown](https://help.github.com/articles/github-flavored-markdown/)), or the deprecated and less accurate `markdown_github`; use [`markdown_github`](#markdown-variants) only if you need extensions not supported in [`gfm`](#markdown-variants). - `haddock` ([Haddock markup](https://www.haskell.org/haddock/doc/html/ch03s08.html)) - `html` or `html5` ([HTML](https://www.w3.org/html/), i.e. 
[HTML5](https://html.spec.whatwg.org/)/XHTML [polyglot markup](https://www.w3.org/TR/html-polyglot/)) - `html4` ([XHTML](https://www.w3.org/TR/xhtml1/) 1.0 Transitional) - `icml` ([InDesign ICML](https://wwwimages.adobe.com/www.adobe.com/content/dam/acom/en/devnet/indesign/sdk/cs6/idml/idml-cookbook.pdf)) - `ipynb` ([Jupyter notebook](https://nbformat.readthedocs.io/en/latest/)) - `jats_archiving` ([JATS](https://jats.nlm.nih.gov) XML, Archiving and Interchange Tag Set) - `jats_articleauthoring` ([JATS](https://jats.nlm.nih.gov) XML, Article Authoring Tag Set) - `jats_publishing` ([JATS](https://jats.nlm.nih.gov) XML, Journal Publishing Tag Set) - `jats` (alias for `jats_archiving`) - `jira` ([Jira](https://jira.atlassian.com/secure/WikiRendererHelpAction.jspa?section=all)/Confluence wiki markup) - `json` (JSON version of native AST) - `latex` ([LaTeX](https://www.latex-project.org/)) - `man` ([roff man](https://man.cx/groff_man(7))) - `markdown` ([Pandoc’s Markdown](#pandocs-markdown)) - `markdown_mmd` ([MultiMarkdown](https://fletcherpenney.net/multimarkdown/)) - `markdown_phpextra` ([PHP Markdown Extra](https://michelf.ca/projects/php-markdown/extra/)) - `markdown_strict` (original unextended [Markdown](https://daringfireball.net/projects/markdown/)) - `markua` ([Markua](https://leanpub.com/markua/read)) - `mediawiki` ([MediaWiki markup](https://www.mediawiki.org/wiki/Help:Formatting)) - `ms` ([roff ms](https://man.cx/groff_ms(7))) - `muse` ([Muse](https://amusewiki.org/library/manual)) - `native` (native Haskell) - `odt` ([OpenOffice text document](https://en.wikipedia.org/wiki/OpenDocument)) - `opml` ([OPML](http://dev.opml.org/spec2.html)) - `opendocument` ([OpenDocument](http://opendocument.xml.org)) - `org` ([Emacs Org mode](https://orgmode.org)) - `pdf` ([PDF](https://www.adobe.com/pdf/)) - `plain` (plain text) - `pptx` ([PowerPoint](https://en.wikipedia.org/wiki/Microsoft_PowerPoint) slide show) - `rst` 
([reStructuredText](https://docutils.sourceforge.io/docs/ref/rst/introduction.html)) - `rtf` ([Rich Text Format](https://en.wikipedia.org/wiki/Rich_Text_Format)) - `texinfo` ([GNU Texinfo](https://www.gnu.org/software/texinfo/)) - `textile` ([Textile](https://textile-lang.com)) - `slideous` ([Slideous](https://goessner.net/articles/slideous/) HTML and JavaScript slide show) - `slidy` ([Slidy](https://www.w3.org/Talks/Tools/Slidy2/) HTML and JavaScript slide show) - `dzslides` ([DZSlides](https://paulrouget.com/dzslides/) HTML5 + JavaScript slide show) - `revealjs` ([reveal.js](https://revealjs.com/) HTML5 + JavaScript slide show) - `s5` ([S5](https://meyerweb.com/eric/tools/s5/) HTML and JavaScript slide show) - `tei` ([TEI Simple](https://github.com/TEIC/TEI-Simple)) - `typst` ([typst](https://typst.app)) - `xwiki` ([XWiki markup](https://www.xwiki.org/xwiki/bin/view/Documentation/UserGuide/Features/XWikiSyntax/)) - `zimwiki` ([ZimWiki markup](https://zim-wiki.org/manual/Help/Wiki_Syntax.html)) - the path of a custom Lua writer, see [Custom readers and writers](#custom-readers-and-writers) below ::: Note that `odt`, `docx`, `epub`, and `pdf` output will not be directed to _stdout_ unless forced with `-o -`. Extensions can be individually enabled or disabled by appending `+EXTENSION` or `-EXTENSION` to the format name. See [Extensions](#extensions) below, for a list of extensions and their names. See `--list-output-formats` and `--list-extensions`, below. : `-o` _FILE_, `--output=`_FILE_ Write output to _FILE_ instead of _stdout_. If _FILE_ is `-`, output will go to _stdout_, even if a non-textual format (`docx`, `odt`, `epub2`, `epub3`) is specified. If the output format is `chunkedhtml` and _FILE_ has no extension, then instead of producing a `.zip` file pandoc will create a directory _FILE_ and unpack the zip archive there (unless _FILE_ already exists, in which case an error will be raised). 
: `--data-dir=`_DIRECTORY_ Specify the user data directory to search for pandoc data files. If this option is not specified, the default user data directory will be used. On \*nix and macOS systems this will be the `pandoc` subdirectory of the XDG data directory (by default, `$HOME/.local/share`, overridable by setting the `XDG_DATA_HOME` environment variable). If that directory does not exist and `$HOME/.pandoc` exists, it will be used (for backwards compatibility). On Windows the default user data directory is `%APPDATA%\pandoc`. You can find the default user data directory on your system by looking at the output of `pandoc --version`. Data files placed in this directory (for example, `reference.odt`, `reference.docx`, `epub.css`, `templates`) will override pandoc’s normal defaults. (Note that the user data directory is not created by pandoc, so you will need to create it yourself if you want to make use of it.) : `-d` _FILE_, `--defaults=`_FILE_ Specify a set of default option settings. _FILE_ is a YAML file whose fields correspond to command-line option settings. All options for document conversion, including input and output files, can be set using a defaults file. The file will be searched for first in the working directory, and then in the `defaults` subdirectory of the user data directory (see `--data-dir`). The `.yaml` extension may be omitted. See the section [Defaults files](#defaults-files) for more information on the file format. Settings from the defaults file may be overridden or extended by subsequent options on the command line. : `--bash-completion` Generate a bash completion script. To enable bash completion with pandoc, add this to your `.bashrc`: ``` eval "$(pandoc --bash-completion)" ``` : `--verbose` Give verbose debugging output. : `--quiet` Suppress warning messages. : `--fail-if-warnings[=true|false]` Exit with error status if there are any warnings. : `--log=`_FILE_ Write log messages in machine-readable JSON format to _FILE_. 
All messages above DEBUG level will be written, regardless of verbosity settings (`--verbose`, `--quiet`). : `--list-input-formats` List supported input formats, one per line. : `--list-output-formats` List supported output formats, one per line. : `--list-extensions`\[`=`_FORMAT_\] List supported extensions for _FORMAT_, one per line, preceded by a `+` or `-` indicating whether it is enabled by default in _FORMAT_. If _FORMAT_ is not specified, defaults for pandoc’s Markdown are given. : `--list-highlight-languages` List supported languages for syntax highlighting, one per line. : `--list-highlight-styles` List supported styles for syntax highlighting, one per line. See `--highlight-style`. : `-v`, `--version` Print version. : `-h`, `--help` Show usage message. {#reader-options .options} ## Reader options : `--shift-heading-level-by=`_NUMBER_ Shift heading levels by a positive or negative integer. For example, with `--shift-heading-level-by=-1`, level 2 headings become level 1 headings, and level 3 headings become level 2 headings. Headings cannot have a level less than 1, so a heading that would be shifted below level 1 becomes a regular paragraph. Exception: with a shift of -N, a level-N heading at the beginning of the document replaces the metadata title. `--shift-heading-level-by=-1` is a good choice when converting HTML or Markdown documents that use an initial level-1 heading for the document title and level-2+ headings for sections. `--shift-heading-level-by=1` may be a good choice for converting Markdown documents that use level-1 headings for sections to HTML, since pandoc uses a level-1 heading to render the document title. : `--base-header-level=`_NUMBER_ _Deprecated. Use `--shift-heading-level-by`=X instead, where X = NUMBER - 1._ Specify the base level for headings (defaults to 1). : `--indented-code-classes=`_CLASSES_ Specify classes to use for indented code blocks–for example, `perl,numberLines` or `haskell`. 
Multiple classes may be separated by spaces or commas. : `--default-image-extension=`_EXTENSION_ Specify a default extension to use when image paths/URLs have no extension. This allows you to use the same source for formats that require different kinds of images. Currently this option only affects the Markdown and LaTeX readers. : `--file-scope[=true|false]` Parse each file individually before combining for multifile documents. This will allow footnotes in different files with the same identifiers to work as expected. If this option is set, footnotes and links will not work across files. Reading binary files (docx, odt, epub) implies `--file-scope`. If two or more files are processed using `--file-scope`, prefixes based on the filenames will be added to identifiers in order to disambiguate them, and internal links will be adjusted accordingly. For example, a header with identifier `foo` in `subdir/file1.txt` will have its identifier changed to `subdir__file1.txt__foo`. In addition, a Div with an identifier based on the filename will be added around the file’s content, so that internal links to the filename will point to this Div’s identifier. : `-F` _PROGRAM_, `--filter=`_PROGRAM_ Specify an executable to be used as a filter transforming the pandoc AST after the input is parsed and before the output is written. The executable should read JSON from stdin and write JSON to stdout. The JSON must be formatted like pandoc’s own JSON input and output. The name of the output format will be passed to the filter as the first argument. Hence, ``` pandoc --filter ./caps.py -t latex ``` is equivalent to ``` pandoc -t json | ./caps.py latex | pandoc -f json -t latex ``` The latter form may be useful for debugging filters. Filters may be written in any language. `Text.Pandoc.JSON` exports `toJSONFilter` to facilitate writing filters in Haskell. 
Those who would prefer to write filters in python can use the module [`pandocfilters`](https://github.com/jgm/pandocfilters), installable from PyPI. There are also pandoc filter libraries in [PHP](https://github.com/vinai/pandocfilters-php), [perl](https://metacpan.org/pod/Pandoc::Filter), and [JavaScript/node.js](https://github.com/mvhenderson/pandoc-filter-node). In order of preference, pandoc will look for filters in 1. a specified full or relative path (executable or non-executable), 2. `$DATADIR/filters` (executable or non-executable) where `$DATADIR` is the user data directory (see `--data-dir`, above), 3. `$PATH` (executable only). Filters, Lua-filters, and citeproc processing are applied in the order specified on the command line. : `-L` _SCRIPT_, `--lua-filter=`_SCRIPT_ Transform the document in a similar fashion as JSON filters (see `--filter`), but use pandoc’s built-in Lua filtering system. The given Lua script is expected to return a list of Lua filters which will be applied in order. Each Lua filter must contain element-transforming functions indexed by the name of the AST element on which the filter function should be applied. The `pandoc` Lua module provides helper functions for element creation. It is always loaded into the script’s Lua environment. See the [Lua filters documentation](https://pandoc.org/lua-filters.html) for further details. In order of preference, pandoc will look for Lua filters in 1. a specified full or relative path, 2. `$DATADIR/filters` where `$DATADIR` is the user data directory (see `--data-dir`, above). Filters, Lua filters, and citeproc processing are applied in the order specified on the command line. : `-M` _KEY_\[`=`_VAL_\], `--metadata=`_KEY_\[`:`_VAL_\] Set the metadata field _KEY_ to the value _VAL_. A value specified on the command line overrides a value specified in the document using [YAML metadata blocks](#extension-yaml_metadata_block). Values will be parsed as YAML boolean or string values. 
If no value is specified, the value will be treated as Boolean true. Like `--variable`, `--metadata` causes template variables to be set. But unlike `--variable`, `--metadata` affects the metadata of the underlying document (which is accessible from filters and may be printed in some output formats) and metadata values will be escaped when inserted into the template. : `--metadata-file=`_FILE_ Read metadata from the supplied YAML (or JSON) file. This option can be used with every input format, but string scalars in the metadata file will always be parsed as Markdown. (If the input format is Markdown or a Markdown variant, then the same variant will be used to parse the metadata file; if it is a non-Markdown format, pandoc’s default Markdown extensions will be used.) This option can be used repeatedly to include multiple metadata files; values in files specified later on the command line will be preferred over those specified in earlier files. Metadata values specified inside the document, or by using `-M`, overwrite values specified with this option. The file will be searched for first in the working directory, and then in the `metadata` subdirectory of the user data directory (see `--data-dir`). : `-p`, `--preserve-tabs[=true|false]` Preserve tabs instead of converting them to spaces. (By default, pandoc converts tabs to spaces before parsing its input.) Note that this will only affect tabs in literal code spans and code blocks. Tabs in regular text are always treated as spaces. : `--tab-stop=`_NUMBER_ Specify the number of spaces per tab (default is 4). : `--track-changes=accept`|`reject`|`all` Specifies what to do with insertions, deletions, and comments produced by the MS Word "Track Changes" feature. `accept` (the default) processes all the insertions and deletions. `reject` ignores them. Both `accept` and `reject` ignore comments. 
`all` includes all insertions, deletions, and comments, wrapped in spans with `insertion`, `deletion`, `comment-start`, and `comment-end` classes, respectively. The author and time of change are included. `all` is useful for scripting: only accepting changes from a certain reviewer, say, or before a certain date. If a paragraph is inserted or deleted, `track-changes=all` produces a span with the class `paragraph-insertion`/`paragraph-deletion` before the affected paragraph break. This option only affects the docx reader. : `--extract-media=`_DIR_ Extract images and other media contained in or linked from the source document to the path _DIR_, creating it if necessary, and adjust the image references in the document so they point to the extracted files. Media are downloaded, read from the file system, or extracted from a binary container (e.g. docx), as needed. The original file paths are used if they are relative paths not containing `..`. Otherwise filenames are constructed from the SHA1 hash of the contents. : `--abbreviations=`_FILE_ Specifies a custom abbreviations file, with abbreviations one to a line. If this option is not specified, pandoc will read the data file `abbreviations` from the user data directory or fall back on a system default. To see the system default, use `pandoc --print-default-data-file=abbreviations`. The only use pandoc makes of this list is in the Markdown reader. Strings found in this list will be followed by a nonbreaking space, and the period will not produce sentence-ending space in formats like LaTeX. The strings may not contain spaces. : `--trace[=true|false]` Print diagnostic output tracing parser progress to stderr. This option is intended for use by developers in diagnosing performance issues. {#general-writer-options .options} ## General writer options : `-s`, `--standalone` Produce output with an appropriate header and footer (e.g. a standalone HTML, LaTeX, TEI, or RTF file, not a fragment). 
This option is set automatically for `pdf`, `epub`, `epub3`, `fb2`, `docx`, and `odt` output. For `native` output, this option causes metadata to be included; otherwise, metadata is suppressed. : `--template=`_FILE_|_URL_ Use the specified file as a custom template for the generated document. Implies `--standalone`. See [Templates](#templates), below, for a description of template syntax. If no extension is specified, an extension corresponding to the writer will be added, so that `--template=special` looks for `special.html` for HTML output. If the template is not found, pandoc will search for it in the `templates` subdirectory of the user data directory (see `--data-dir`). If this option is not used, a default template appropriate for the output format will be used (see `-D/--print-default-template`). : `-V` _KEY_\[`=`_VAL_\], `--variable=`_KEY_\[`:`_VAL_\] Set the template variable _KEY_ to the value _VAL_ when rendering the document in standalone mode. If no _VAL_ is specified, the key will be given the value `true`. : `--sandbox[=true|false]` Run pandoc in a sandbox, limiting IO operations in readers and writers to reading the files specified on the command line. Note that this option does not limit IO operations by filters or in the production of PDF documents. But it does offer security against, for example, disclosure of files through the use of `include` directives. Anyone using pandoc on untrusted user input should use this option. Note: some readers and writers (e.g., `docx`) need access to data files. If these are stored on the file system, then pandoc will not be able to find them when run in `--sandbox` mode and will raise an error. For these applications, we recommend using a pandoc binary compiled with the `embed_data_files` option, which causes the data files to be baked into the binary instead of being stored on the file system. : `-D` _FORMAT_, `--print-default-template=`_FORMAT_ Print the system default template for an output _FORMAT_. 
(See `-t` for a list of possible _FORMAT_s.) Templates in the user data directory are ignored. This option may be used with `-o`/`--output` to redirect output to a file, but `-o`/`--output` must come before `--print-default-template` on the command line. Note that some of the default templates use partials, for example `styles.html`. To print the partials, use `--print-default-data-file`: for example, `--print-default-data-file=templates/styles.html`. : `--print-default-data-file=`_FILE_ Print a system default data file. Files in the user data directory are ignored. This option may be used with `-o`/`--output` to redirect output to a file, but `-o`/`--output` must come before `--print-default-data-file` on the command line. : `--eol=crlf`|`lf`|`native` Manually specify line endings: `crlf` (Windows), `lf` (macOS/Linux/UNIX), or `native` (line endings appropriate to the OS on which pandoc is being run). The default is `native`. : `--dpi`=_NUMBER_ Specify the default dpi (dots per inch) value for conversion from pixels to inch/centimeters and vice versa. (Technically, the correct term would be ppi: pixels per inch.) The default is 96dpi. When images contain information about dpi internally, the encoded value is used instead of the default specified by this option. : `--wrap=auto`|`none`|`preserve` Determine how text is wrapped in the output (the source code, not the rendered version). With `auto` (the default), pandoc will attempt to wrap lines to the column width specified by `--columns` (default 72). With `none`, pandoc will not wrap lines at all. With `preserve`, pandoc will attempt to preserve the wrapping from the source document (that is, where there are nonsemantic newlines in the source, there will be nonsemantic newlines in the output as well). In `ipynb` output, this option affects wrapping of the contents of markdown cells. : `--columns=`_NUMBER_ Specify length of lines in characters. This affects text wrapping in the generated source code (see `--wrap`). 
It also affects calculation of column widths for plain text tables (see [Tables](#tables) below). : `--toc[=true|false]`, `--table-of-contents[=true|false]` Include an automatically generated table of contents (or, in the case of `latex`, `context`, `docx`, `odt`, `opendocument`, `rst`, or `ms`, an instruction to create one) in the output document. This option has no effect unless `-s/--standalone` is used, and it has no effect on `man`, `docbook4`, `docbook5`, or `jats` output. Note that if you are producing a PDF via `ms`, the table of contents will appear at the beginning of the document, before the title. If you would prefer it to be at the end of the document, use the option `--pdf-engine-opt=--no-toc-relocation`. : `--toc-depth=`_NUMBER_ Specify the number of section levels to include in the table of contents. The default is 3 (which means that level-1, 2, and 3 headings will be listed in the contents). : `--strip-comments[=true|false]` Strip out HTML comments in the Markdown or Textile source, rather than passing them on to Markdown, Textile or HTML output as raw HTML. This does not apply to HTML comments inside raw HTML blocks when the `markdown_in_html_blocks` extension is not set. : `--no-highlight` Disables syntax highlighting for code blocks and inlines, even when a language attribute is given. : `--highlight-style=`_STYLE_|_FILE_ Specifies the coloring style to be used in highlighted source code. Options are `pygments` (the default), `kate`, `monochrome`, `breezeDark`, `espresso`, `zenburn`, `haddock`, and `tango`. For more information on syntax highlighting in pandoc, see [Syntax highlighting](#syntax-highlighting), below. See also `--list-highlight-styles`. Instead of a _STYLE_ name, a JSON file with extension `.theme` may be supplied. This will be parsed as a KDE syntax highlighting theme and (if valid) used as the highlighting style. To generate the JSON version of an existing style, use `--print-highlight-style`. 
: `--print-highlight-style=`_STYLE_|_FILE_ Prints a JSON version of a highlighting style, which can be modified, saved with a `.theme` extension, and used with `--highlight-style`. This option may be used with `-o`/`--output` to redirect output to a file, but `-o`/`--output` must come before `--print-highlight-style` on the command line. : `--syntax-definition=`_FILE_ Instructs pandoc to load a KDE XML syntax definition file, which will be used for syntax highlighting of appropriately marked code blocks. This can be used to add support for new languages or to use altered syntax definitions for existing languages. This option may be repeated to add multiple syntax definitions. : `-H` _FILE_, `--include-in-header=`_FILE_|_URL_ Include contents of _FILE_, verbatim, at the end of the header. This can be used, for example, to include special CSS or JavaScript in HTML documents. This option can be used repeatedly to include multiple files in the header. They will be included in the order specified. Implies `--standalone`. : `-B` _FILE_, `--include-before-body=`_FILE_|_URL_ Include contents of _FILE_, verbatim, at the beginning of the document body (e.g. after the `<body>` tag in HTML, or the `\begin{document}` command in LaTeX). This can be used to include navigation bars or banners in HTML documents. This option can be used repeatedly to include multiple files. They will be included in the order specified. Implies `--standalone`. : `-A` _FILE_, `--include-after-body=`_FILE_|_URL_ Include contents of _FILE_, verbatim, at the end of the document body (before the `</body>` tag in HTML, or the `\end{document}` command in LaTeX). This option can be used repeatedly to include multiple files. They will be included in the order specified. Implies `--standalone`. : `--resource-path=`_SEARCHPATH_ List of paths to search for images and other resources. The paths should be separated by `:` on Linux, UNIX, and macOS systems, and by `;` on Windows. 
If `--resource-path` is not specified, the default resource path is the working directory. Note that, if `--resource-path` is specified, the working directory must be explicitly listed or it will not be searched. For example: `--resource-path=.:test` will search the working directory and the `test` subdirectory, in that order. This option can be used repeatedly. Search path components that come later on the command line will be searched before those that come earlier, so `--resource-path foo:bar --resource-path baz:bim` is equivalent to `--resource-path baz:bim:foo:bar`. : `--request-header=`_NAME_`:`_VAL_ Set the request header _NAME_ to the value _VAL_ when making HTTP requests (for example, when a URL is given on the command line, or when resources used in a document must be downloaded). If you’re behind a proxy, you also need to set the environment variable `http_proxy` to `http://...`. : `--no-check-certificate[=true|false]` Disable the certificate verification to allow access to unsecure HTTP resources (for example when the certificate is no longer valid or self signed). {#options-affecting-specific-writers .options} ## Options affecting specific writers : `--self-contained[=true|false]` _Deprecated synonym for `--embed-resources --standalone`._ : `--embed-resources[=true|false]` Produce a standalone HTML file with no external dependencies, using `data:` URIs to incorporate the contents of linked scripts, stylesheets, images, and videos. The resulting file should be "self-contained," in the sense that it needs no external files and no net access to be displayed properly by a browser. This option works only with HTML output formats, including `html4`, `html5`, `html+lhs`, `html5+lhs`, `s5`, `slidy`, `slideous`, `dzslides`, and `revealjs`. 
Scripts, images, and stylesheets at absolute URLs will be downloaded; those at relative URLs will be sought relative to the working directory (if the first source file is local) or relative to the base URL (if the first source file is remote). Elements with the attribute `data-external="1"` will be left alone; the documents they link to will not be incorporated in the document. Limitation: resources that are loaded dynamically through JavaScript cannot be incorporated; as a result, fonts may be missing when `--mathjax` is used, and some advanced features (e.g. zoom or speaker notes) may not work in an offline "self-contained" `reveal.js` slide show. : `--html-q-tags[=true|false]` Use `<q>` tags for quotes in HTML. (This option only has an effect if the `smart` extension is enabled for the input format used.) : `--ascii[=true|false]` Use only ASCII characters in output. Currently supported for XML and HTML formats (which use entities instead of UTF-8 when this option is selected), CommonMark, gfm, and Markdown (which use entities), roff man and ms (which use hexadecimal escapes), and to a limited degree LaTeX (which uses standard commands for accented characters when possible). : `--reference-links[=true|false]` Use reference-style links, rather than inline links, in writing Markdown or reStructuredText. By default inline links are used. The placement of link references is affected by the `--reference-location` option. : `--reference-location=block`|`section`|`document` Specify whether footnotes (and references, if `reference-links` is set) are placed at the end of the current (top-level) block, the current section, or the document. The default is `document`. Currently this option only affects the `markdown`, `muse`, `html`, `epub`, `slidy`, `s5`, `slideous`, `dzslides`, and `revealjs` writers. In slide formats, specifying `--reference-location=section` will cause notes to be rendered at the bottom of a slide. 
: `--markdown-headings=setext`|`atx` Specify whether to use ATX-style (`#`-prefixed) or Setext-style (underlined) headings for level 1 and 2 headings in Markdown output. (The default is `atx`.) ATX-style headings are always used for levels 3+. This option also affects Markdown cells in `ipynb` output. : `--list-tables[=true|false]` Render tables as list tables in RST output. : `--top-level-division=default`|`section`|`chapter`|`part` Treat top-level headings as the given division type in LaTeX, ConTeXt, DocBook, and TEI output. The hierarchy order is part, chapter, then section; all headings are shifted such that the top-level heading becomes the specified type. The default behavior is to determine the best division type via heuristics: unless other conditions apply, `section` is chosen. When the `documentclass` variable is set to `report`, `book`, or `memoir` (unless the `article` option is specified), `chapter` is implied as the setting for this option. If `beamer` is the output format, specifying either `chapter` or `part` will cause top-level headings to become `\part{..}`, while second-level headings remain as their default type. : `-N`, `--number-sections` Number section headings in LaTeX, ConTeXt, HTML, Docx, ms, or EPUB output. By default, sections are not numbered. Sections with class `unnumbered` will never be numbered, even if `--number-sections` is specified. : `--number-offset=`_NUMBER_\[`,`_NUMBER_`,`_…_\] Offset for section headings in HTML output (ignored in other output formats). The first number is added to the section number for top-level headings, the second for second-level headings, and so on. So, for example, if you want the first top-level heading in your document to be numbered "6", specify `--number-offset=5`. If your document starts with a level-2 heading which you want to be numbered "1.5", specify `--number-offset=1,4`. Offsets are 0 by default. Implies `--number-sections`. 
: `--listings[=true|false]` Use the [`listings`](https://ctan.org/pkg/listings) package for LaTeX code blocks. The package does not support multi-byte encoding for source code. To handle UTF-8 you would need to use a custom template. This issue is fully documented here: [Encoding issue with the listings package](https://en.wikibooks.org/wiki/LaTeX/Source_Code_Listings#Encoding_issue). : `-i`, `--incremental[=true|false]` Make list items in slide shows display incrementally (one by one). The default is for lists to be displayed all at once. : `--slide-level=`_NUMBER_ Specifies that headings with the specified level create slides (for `beamer`, `s5`, `slidy`, `slideous`, `dzslides`). Headings above this level in the hierarchy are used to divide the slide show into sections; headings below this level create subheads within a slide. Valid values are 0-6. If a slide level of 0 is specified, slides will not be split automatically on headings, and horizontal rules must be used to indicate slide boundaries. If a slide level is not specified explicitly, the slide level will be set automatically based on the contents of the document; see [Structuring the slide show](#structuring-the-slide-show). : `--section-divs[=true|false]` Wrap sections in `<section>` tags (or `<div>` tags for `html4`), and attach identifiers to the enclosing `<section>` (or `<div>`) rather than the heading itself (see [Heading identifiers](#heading-identifiers), below). This option only affects HTML output (and does not affect HTML slide formats). : `--email-obfuscation=none`|`javascript`|`references` Specify a method for obfuscating `mailto:` links in HTML documents. `none` leaves `mailto:` links as they are. `javascript` obfuscates them using JavaScript. `references` obfuscates them by printing their letters as decimal or hexadecimal character references. The default is `none`. : `--id-prefix=`_STRING_ Specify a prefix to be added to all identifiers and internal links in HTML and DocBook output, and to footnote numbers in Markdown and Haddock output. This is useful for preventing duplicate identifiers when generating fragments to be included in other pages. : `-T` _STRING_, `--title-prefix=`_STRING_ Specify _STRING_ as a prefix at the beginning of the title that appears in the HTML header (but not in the title as it appears at the beginning of the HTML body). Implies `--standalone`. : `-c` _URL_, `--css=`_URL_ Link to a CSS style sheet. This option can be used repeatedly to include multiple files. They will be included in the order specified. This option only affects HTML (including HTML slide shows) and EPUB output. It should be used together with `-s/--standalone`, because the link to the stylesheet goes in the document header. A stylesheet is required for generating EPUB. If none is provided using this option (or the `css` or `stylesheet` metadata fields), pandoc will look for a file `epub.css` in the user data directory (see `--data-dir`). If it is not found there, sensible defaults will be used. : `--reference-doc=`_FILE_|_URL_ Use the specified file as a style reference in producing a docx or ODT file. : Docx For best results, the reference docx should be a modified version of a docx file produced using pandoc. 
The contents of the reference docx are ignored, but its stylesheets and document properties (including margins, page size, header, and footer) are used in the new docx. If no reference docx is specified on the command line, pandoc will look for a file `reference.docx` in the user data directory (see `--data-dir`). If this is not found either, sensible defaults will be used. To produce a custom `reference.docx`, first get a copy of the default `reference.docx`: `pandoc -o custom-reference.docx --print-default-data-file reference.docx`. Then open `custom-reference.docx` in Word, modify the styles as you wish, and save the file. For best results, do not make changes to this file other than modifying the styles used by pandoc: Paragraph styles: - Normal - Body Text - First Paragraph - Compact - Title - Subtitle - Author - Date - Abstract - AbstractTitle - Bibliography - Heading 1 - Heading 2 - Heading 3 - Heading 4 - Heading 5 - Heading 6 - Heading 7 - Heading 8 - Heading 9 - Block Text - Source Code - Footnote Text - Definition Term - Definition - Caption - Table Caption - Image Caption - Figure - Captioned Figure - TOC Heading Character styles: - Default Paragraph Font - Body Text Char - Verbatim Char - Footnote Reference - Hyperlink - Section Number Table style: - Table : ODT For best results, the reference ODT should be a modified version of an ODT produced using pandoc. The contents of the reference ODT are ignored, but its stylesheets are used in the new ODT. If no reference ODT is specified on the command line, pandoc will look for a file `reference.odt` in the user data directory (see `--data-dir`). If this is not found either, sensible defaults will be used. To produce a custom `reference.odt`, first get a copy of the default `reference.odt`: `pandoc -o custom-reference.odt --print-default-data-file reference.odt`. Then open `custom-reference.odt` in LibreOffice, modify the styles as you wish, and save the file. 
: PowerPoint Templates included with Microsoft PowerPoint 2013 (either with `.pptx` or `.potx` extension) are known to work, as are most templates derived from these. The specific requirement is that the template should contain layouts with the following names (as seen within PowerPoint): - Title Slide - Title and Content - Section Header - Two Content - Comparison - Content with Caption - Blank For each name, the first layout found with that name will be used. If no layout is found with one of the names, pandoc will output a warning and use the layout with that name from the default reference doc instead. (How these layouts are used is described in [PowerPoint layout choice](#powerpoint-layout-choice).) All templates included with a recent version of MS PowerPoint will fit these criteria. (You can click on `Layout` under the `Home` menu to check.) You can also modify the default `reference.pptx`: first run `pandoc -o custom-reference.pptx --print-default-data-file reference.pptx`, and then modify `custom-reference.pptx` in MS PowerPoint (pandoc will use the layouts with the names listed above). : `--split-level=`_NUMBER_ Specify the heading level at which to split an EPUB or chunked HTML document into separate files. The default is to split into chapters at level-1 headings. In the case of EPUB, this option only affects the internal composition of the EPUB, not the way chapters and sections are displayed to users. Some readers may be slow if the chapter files are too large, so for large documents with few level-1 headings, one might want to use a chapter level of 2 or 3. For chunked HTML, this option determines how much content goes in each "chunk." : `--chunk-template=`_PATHTEMPLATE_ Specify a template for the filenames in a `chunkedhtml` document. 
In the template, `%n` will be replaced by the chunk number (padded with leading 0s to 3 digits), `%s` with the section number of the chunk, `%h` with the heading text (with formatting removed), `%i` with the section identifier. For example, `%section-%s-%i.html` might be resolved to `section-1.1-introduction.html`. The characters `/` and `\` are not allowed in chunk templates and will be ignored. The default is `%s-%i.html`. : `--epub-chapter-level=`_NUMBER_ _Deprecated synonym for `--split-level`._ : `--epub-cover-image=`_FILE_ Use the specified image as the EPUB cover. It is recommended that the image be less than 1000px in width and height. Note that in a Markdown source document you can also specify `cover-image` in a YAML metadata block (see [EPUB Metadata](#epub-metadata), below). : `--epub-title-page=true`|`false` Determines whether the title page is included in the EPUB (default is `true`). : `--epub-metadata=`_FILE_ Look in the specified XML file for metadata for the EPUB. The file should contain a series of [Dublin Core elements](https://www.dublincore.org/specifications/dublin-core/dces/). For example: ``` <dc:rights>Creative Commons</dc:rights> <dc:language>es-AR</dc:language> ``` By default, pandoc will include the following metadata elements: `<dc:title>` (from the document title), `<dc:creator>` (from the document authors), `<dc:date>` (from the document date, which should be in [ISO 8601 format](https://www.w3.org/TR/NOTE-datetime)), `<dc:language>` (from the `lang` variable, or, if it is not set, the locale), and `<dc:identifier id="BookId">` (a randomly generated UUID). Any of these may be overridden by elements in the metadata file. Note: if the source document is Markdown, a YAML metadata block in the document can be used instead. See below under [EPUB Metadata](#epub-metadata). : `--epub-embed-font=`_FILE_ Embed the specified font in the EPUB. This option can be repeated to embed multiple fonts. Wildcards can also be used: for example, `DejaVuSans-*.ttf`. 
However, if you use wildcards on the command line, be sure to escape them or put the whole filename in single quotes, to prevent them from being interpreted by the shell. To use the embedded fonts, you will need to add declarations like the following to your CSS (see `--css`): ``` @font-face { font-family: DejaVuSans; font-style: normal; font-weight: normal; src:url("../fonts/DejaVuSans-Regular.ttf"); } @font-face { font-family: DejaVuSans; font-style: normal; font-weight: bold; src:url("../fonts/DejaVuSans-Bold.ttf"); } @font-face { font-family: DejaVuSans; font-style: italic; font-weight: normal; src:url("../fonts/DejaVuSans-Oblique.ttf"); } @font-face { font-family: DejaVuSans; font-style: italic; font-weight: bold; src:url("../fonts/DejaVuSans-BoldOblique.ttf"); } body { font-family: "DejaVuSans"; } ``` : `--epub-subdirectory=`_DIRNAME_ Specify the subdirectory in the OCF container that is to hold the EPUB-specific contents. The default is `EPUB`. To put the EPUB contents in the top level, use an empty string. : `--ipynb-output=all|none|best` Determines how ipynb output cells are treated. `all` means that all of the data formats included in the original are preserved. `none` means that the contents of data cells are omitted. `best` causes pandoc to try to pick the richest data block in each output cell that is compatible with the output format. The default is `best`. : `--pdf-engine=`_PROGRAM_ Use the specified engine when producing PDF output. Valid values are `pdflatex`, `lualatex`, `xelatex`, `latexmk`, `tectonic`, `wkhtmltopdf`, `weasyprint`, `pagedjs-cli`, `prince`, `context`, `pdfroff`, and `typst`. If the engine is not in your PATH, the full path of the engine may be specified here. 
If this option is not specified, pandoc uses the following defaults depending on the output format specified using `-t/--to`: - `-t latex` or none: `pdflatex` (other options: `xelatex`, `lualatex`, `tectonic`, `latexmk`) - `-t context`: `context` - `-t html`: `wkhtmltopdf` (other options: `prince`, `weasyprint`, `pagedjs-cli`; see [print-css.rocks](https://print-css.rocks) for a good introduction to PDF generation from HTML/CSS) - `-t ms`: `pdfroff` - `-t typst`: `typst` : `--pdf-engine-opt=`_STRING_ Use the given string as a command-line argument to the `pdf-engine`. For example, to use a persistent directory `foo` for `latexmk`’s auxiliary files, use `--pdf-engine-opt=-outdir=foo`. Note that no check for duplicate options is done. {#citation-rendering .options} ## Citation rendering : `-C`, `--citeproc` Process the citations in the file, replacing them with rendered citations and adding a bibliography. Citation processing will not take place unless bibliographic data is supplied, either through an external file specified using the `--bibliography` option or the `bibliography` field in metadata, or via a `references` section in metadata containing a list of citations in CSL YAML format with Markdown formatting. The style is controlled by a [CSL](https://docs.citationstyles.org/en/stable/specification.html) stylesheet specified using the `--csl` option or the `csl` field in metadata. (If no stylesheet is specified, the `chicago-author-date` style will be used by default.) The citation processing transformation may be applied before or after filters or Lua filters (see `--filter`, `--lua-filter`): these transformations are applied in the order they appear on the command line. For more information, see the section on [Citations](#citations). : `--bibliography=`_FILE_ Set the `bibliography` field in the document’s metadata to _FILE_, overriding any value set in the metadata. If you supply this argument multiple times, each _FILE_ will be added to bibliography. 
If _FILE_ is a URL, it will be fetched via HTTP. If _FILE_ is not found relative to the working directory, it will be sought in the resource path (see `--resource-path`). : `--csl=`_FILE_ Set the `csl` field in the document’s metadata to _FILE_, overriding any value set in the metadata. (This is equivalent to `--metadata csl=FILE`.) If _FILE_ is a URL, it will be fetched via HTTP. If _FILE_ is not found relative to the working directory, it will be sought in the resource path (see `--resource-path`) and finally in the `csl` subdirectory of the pandoc user data directory. : `--citation-abbreviations=`_FILE_ Set the `citation-abbreviations` field in the document’s metadata to _FILE_, overriding any value set in the metadata. (This is equivalent to `--metadata citation-abbreviations=FILE`.) If _FILE_ is a URL, it will be fetched via HTTP. If _FILE_ is not found relative to the working directory, it will be sought in the resource path (see `--resource-path`) and finally in the `csl` subdirectory of the pandoc user data directory. : `--natbib` Use [`natbib`](https://ctan.org/pkg/natbib) for citations in LaTeX output. This option is not for use with the `--citeproc` option or with PDF output. It is intended for use in producing a LaTeX file that can be processed with [`bibtex`](https://ctan.org/pkg/bibtex). : `--biblatex` Use [`biblatex`](https://ctan.org/pkg/biblatex) for citations in LaTeX output. This option is not for use with the `--citeproc` option or with PDF output. It is intended for use in producing a LaTeX file that can be processed with [`bibtex`](https://ctan.org/pkg/bibtex) or [`biber`](https://ctan.org/pkg/biber). {#math-rendering-in-html .options} ## Math rendering in HTML The default is to render TeX math as far as possible using Unicode characters. Formulas are put inside a `span` with `class="math"`, so that they may be styled differently from the surrounding text if needed. 
However, this gives acceptable results only for basic math; usually you will want to use `--mathjax` or another of the following options. : `--mathjax`\[`=`_URL_\] Use [MathJax](https://www.mathjax.org) to display embedded TeX math in HTML output. TeX math will be put between `\(...\)` (for inline math) or `\[...\]` (for display math) and wrapped in `<span>` tags with class `math`. Then the MathJax JavaScript will render it. The _URL_ should point to the `MathJax.js` load script. If a _URL_ is not provided, a link to the Cloudflare CDN will be inserted. : `--mathml` Convert TeX math to [MathML](https://www.w3.org/Math/) (in `epub3`, `docbook4`, `docbook5`, `jats`, `html4` and `html5`). This is the default in `odt` output. MathML is supported natively by the main web browsers and select e-book readers. : `--webtex`\[`=`_URL_\] Convert TeX formulas to `<img>` tags that link to an external script that converts formulas to images. The formula will be URL-encoded and concatenated with the URL provided. For SVG images you can for example use `--webtex https://latex.codecogs.com/svg.latex?`. If no URL is specified, the CodeCogs URL generating PNGs will be used (`https://latex.codecogs.com/png.latex?`). Note: the `--webtex` option will affect Markdown output as well as HTML, which is useful if you’re targeting a version of Markdown without native math support. : `--katex`\[`=`_URL_\] Use [KaTeX](https://github.com/Khan/KaTeX) to display embedded TeX math in HTML output. The _URL_ is the base URL for the KaTeX library. That directory should contain a `katex.min.js` and a `katex.min.css` file. If a _URL_ is not provided, a link to the KaTeX CDN will be inserted. : `--gladtex` Enclose TeX math in `<eq>` tags in HTML output. The resulting HTML can then be processed by [GladTeX](https://humenda.github.io/GladTeX/) to produce SVG images of the typeset formulas and an HTML file with these images embedded. 
``` pandoc -s --gladtex input.md -o myfile.htex gladtex -d image_dir myfile.htex # produces myfile.html and images in image_dir ``` {#options-for-wrapper-scripts .options} ## Options for wrapper scripts : `--dump-args[=true|false]` Print information about command-line arguments to _stdout_, then exit. This option is intended primarily for use in wrapper scripts. The first line of output contains the name of the output file specified with the `-o` option, or `-` (for _stdout_) if no output file was specified. The remaining lines contain the command-line arguments, one per line, in the order they appear. These do not include regular pandoc options and their arguments, but do include any options appearing after a `--` separator at the end of the line. : `--ignore-args[=true|false]` Ignore command-line arguments (for use in wrapper scripts). Regular pandoc options are not ignored. Thus, for example, ``` pandoc --ignore-args -o foo.html -s foo.txt -- -e latin1 ``` is equivalent to ``` pandoc -o foo.html -s ``` {#exit-codes} # Exit codes If pandoc completes successfully, it will return exit code 0. 
Nonzero exit codes have the following meanings: |Code|Error| |--:|:--| |1|PandocIOError| |3|PandocFailOnWarningError| |4|PandocAppError| |5|PandocTemplateError| |6|PandocOptionError| |21|PandocUnknownReaderError| |22|PandocUnknownWriterError| |23|PandocUnsupportedExtensionError| |24|PandocCiteprocError| |25|PandocBibliographyError| |31|PandocEpubSubdirectoryError| |43|PandocPDFError| |44|PandocXMLError| |47|PandocPDFProgramNotFoundError| |61|PandocHttpError| |62|PandocShouldNeverHappenError| |63|PandocSomeError| |64|PandocParseError| |66|PandocMakePDFError| |67|PandocSyntaxMapError| |83|PandocFilterError| |84|PandocLuaError| |89|PandocNoScriptingEngine| |91|PandocMacroLoop| |92|PandocUTF8DecodingError| |93|PandocIpynbDecodingError| |94|PandocUnsupportedCharsetError| |97|PandocCouldNotFindDataFileError| |98|PandocCouldNotFindMetadataFileError| |99|PandocResourceNotFound| {#defaults-files} # Defaults files The `--defaults` option may be used to specify a package of options, in the form of a YAML file. Fields that are omitted will just have their regular default values. So a defaults file can be as simple as one line: ``` yaml verbosity: INFO ``` In fields that expect a file path (or list of file paths), the following syntax may be used to interpolate environment variables: ``` yaml csl: ${HOME}/mycsldir/special.csl ``` `${USERDATA}` may also be used; this will always resolve to the user data directory that is current when the defaults file is parsed, regardless of the setting of the environment variable `USERDATA`. `${.}` will resolve to the directory containing the defaults file itself. This allows you to refer to resources contained in that directory: ``` yaml epub-cover-image: ${.}/cover.jpg epub-metadata: ${.}/meta.xml resource-path: - . 
# the working directory from which pandoc is run - ${.}/images # the images subdirectory of the directory # containing this defaults file ``` This environment variable interpolation syntax _only_ works in fields that expect file paths. Defaults files can be placed in the `defaults` subdirectory of the user data directory and used from any directory. For example, one could create a file specifying defaults for writing letters, save it as `letter.yaml` in the `defaults` subdirectory of the user data directory, and then invoke these defaults from any directory using `pandoc --defaults letter` or `pandoc -dletter`. When multiple defaults are used, their contents will be combined. Note that, where command-line arguments may be repeated (`--metadata-file`, `--css`, `--include-in-header`, `--include-before-body`, `--include-after-body`, `--variable`, `--metadata`, `--syntax-definition`), the values specified on the command line will combine with values specified in the defaults file, rather than replacing them. The following tables show the mapping between the command line and defaults file entries. |command line|defaults file| |:--|:--| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| The value of `input-files` may be left empty to indicate input from stdin, and it can be an empty sequence `[]` for no input. 
{#general-options-1} ## General options |command line|defaults file| |:--|:--| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| Options specified in a defaults file itself always have priority over those in another file included with a `defaults:` entry. `verbosity` can have the values `ERROR`, `WARNING`, or `INFO`. {#reader-options-1} ## Reader options |command line|defaults file| |:--|:--| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| Metadata values specified in a defaults file are parsed as literal string text, not Markdown. Filters will be assumed to be Lua filters if they have the `.lua` extension, and JSON filters otherwise. But the filter type can also be specified explicitly, as shown. Filters are run in the order specified. To include the built-in citeproc filter, use either `citeproc` or `{type: citeproc}`. 
{#general-writer-options-1} ## General writer options |command line|defaults file| |:--|:--| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| {#options-affecting-specific-writers-1} ## Options affecting specific writers |command line|defaults file| |:--|:--| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content 
omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| {#citation-rendering-1} ## Citation rendering |command line|defaults file| |:--|:--| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| `cite-method` can be `citeproc`, `natbib`, or `biblatex`. This only affects LaTeX output. If you want to use citeproc to format citations, you should also set 'citeproc: true'. If you need control over when the citeproc processing is done relative to other filters, you should instead use `citeproc` in the list of `filters` (see above). {#math-rendering-in-html-1} ## Math rendering in HTML |command line|defaults file| |:--|:--| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| In addition to the values listed above, `method` can have the value `plain`. If the command line option accepts a URL argument, an `url:` field can be added to `html-math-method:`. {#options-for-wrapper-scripts-1} ## Options for wrapper scripts |command line|defaults file| |:--|:--| |((content omitted))|((content omitted))| |((content omitted))|((content omitted))| {#templates} # Templates When the `-s/--standalone` option is used, pandoc uses a template to add header and footer material that is needed for a self-standing document. To see the default template that is used, just type ``` pandoc -D *FORMAT* ``` where _FORMAT_ is the name of the output format. A custom template can be specified using the `--template` option. 
You can also override the system default templates for a given output format _FORMAT_ by putting a file `templates/default.*FORMAT*` in the user data directory (see `--data-dir`, above). _Exceptions:_ - For `odt` output, customize the `default.opendocument` template. - For `pdf` output, customize the `default.latex` template (or the `default.context` template, if you use `-t context`, or the `default.ms` template, if you use `-t ms`, or the `default.html` template, if you use `-t html`). - `docx` and `pptx` have no template (however, you can use `--reference-doc` to customize the output). Templates contain _variables_, which allow for the inclusion of arbitrary information at any point in the file. They may be set at the command line using the `-V/--variable` option. If a variable is not set, pandoc will look for the key in the document’s metadata, which can be set using either [YAML metadata blocks](#extension-yaml_metadata_block) or with the `-M/--metadata` option. In addition, some variables are given default values by pandoc. See [Variables](#variables) below for a list of variables used in pandoc’s default templates. If you use custom templates, you may need to revise them as pandoc changes. We recommend tracking the changes in the default templates, and modifying your custom templates accordingly. An easy way to do this is to fork the [pandoc-templates](https://github.com/jgm/pandoc-templates) repository and merge in changes after each pandoc release. {#template-syntax} ## Template syntax {#comments} ### Comments Anything between the sequence `$--` and the end of the line will be treated as a comment and omitted from the output. {#delimiters} ### Delimiters To mark variables and control structures in the template, either `$`…`$` or `${`…`}` may be used as delimiters. The styles may also be mixed in the same template, but the opening and closing delimiter must match in each case. 
The opening delimiter may be followed by one or more spaces or tabs, which will be ignored. The closing delimiter may be preceded by one or more spaces or tabs, which will be ignored. To include a literal `$` in the document, use `$$`. {#interpolated-variables} ### Interpolated variables A slot for an interpolated variable is a variable name surrounded by matched delimiters. Variable names must begin with a letter and can contain letters, numbers, `_`, `-`, and `.`. The keywords `it`, `if`, `else`, `endif`, `for`, `sep`, and `endfor` may not be used as variable names. Examples: ``` $foo$ $foo.bar.baz$ $foo_bar.baz-bim$ $ foo $ ${foo} ${foo.bar.baz} ${foo_bar.baz-bim} ${ foo } ``` Variable names with periods are used to get at structured variable values. So, for example, `employee.salary` will return the value of the `salary` field of the object that is the value of the `employee` field. - If the value of the variable is a simple value, it will be rendered verbatim. (Note that no escaping is done; the assumption is that the calling program will escape the strings appropriately for the output format.) - If the value is a list, the values will be concatenated. - If the value is a map, the string `true` will be rendered. - Every other value will be rendered as the empty string. {#conditionals} ### Conditionals A conditional begins with `if(variable)` (enclosed in matched delimiters) and ends with `endif` (enclosed in matched delimiters). It may optionally contain an `else` (enclosed in matched delimiters). The `if` section is used if `variable` has a non-empty value, otherwise the `else` section is used (if present). Examples: ``` $if(foo)$bar$endif$ $if(foo)$ $foo$ $endif$ $if(foo)$ part one $else$ part two $endif$ ${if(foo)}bar${endif} ${if(foo)} ${foo} ${endif} ${if(foo)} ${ foo.bar } ${else} no foo! 
${endif} ``` The keyword `elseif` may be used to simplify complex nested conditionals: ``` $if(foo)$ XXX $elseif(bar)$ YYY $else$ ZZZ $endif$ ``` {#for-loops} ### For loops A for loop begins with `for(variable)` (enclosed in matched delimiters) and ends with `endfor` (enclosed in matched delimiters). - If `variable` is an array, the material inside the loop will be evaluated repeatedly, with `variable` being set to each value of the array in turn, and concatenated. - If `variable` is a map, the material inside will be set to the map. - If the value of the associated variable is not an array or a map, a single iteration will be performed on its value. Examples: ``` $for(foo)$$foo$$sep$, $endfor$ $for(foo)$ - $foo.last$, $foo.first$ $endfor$ ${ for(foo.bar) } - ${ foo.bar.last }, ${ foo.bar.first } ${ endfor } $for(mymap)$ $it.name$: $it.office$ $endfor$ ``` You may optionally specify a separator between consecutive values using `sep` (enclosed in matched delimiters). The material between `sep` and the `endfor` is the separator. ``` ${ for(foo) }${ foo }${ sep }, ${ endfor } ``` Instead of using `variable` inside the loop, the special anaphoric keyword `it` may be used. ``` ${ for(foo.bar) } - ${ it.last }, ${ it.first } ${ endfor } ``` {#partials} ### Partials Partials (subtemplates stored in different files) may be included by using the name of the partial, followed by `()`, for example: ``` ${ styles() } ``` Partials will be sought in the directory containing the main template. The file name will be assumed to have the same extension as the main template if it lacks an extension. When calling the partial, the full name including file extension can also be used: ``` ${ styles.html() } ``` (If a partial is not found in the directory of the template and the template path is given as a relative path, it will also be sought in the `templates` subdirectory of the user data directory.) 
Partials may optionally be applied to variables using a colon: ``` ${ date:fancy() } ${ articles:bibentry() } ``` If `articles` is an array, this will iterate over its values, applying the partial `bibentry()` to each one. So the second example above is equivalent to ``` ${ for(articles) } ${ it:bibentry() } ${ endfor } ``` Note that the anaphoric keyword `it` must be used when iterating over partials. In the above examples, the `bibentry` partial should contain `it.title` (and so on) instead of `articles.title`. Final newlines are omitted from included partials. Partials may include other partials. A separator between values of an array may be specified in square brackets, immediately after the variable name or partial: ``` ${months[, ]} ${articles:bibentry()[; ]} ``` The separator in this case is literal and (unlike with `sep` in an explicit `for` loop) cannot contain interpolated variables or other template directives. {#nesting} ### Nesting To ensure that content is "nested," that is, subsequent lines indented, use the `^` directive: ``` $item.number$ $^$$item.description$ ($item.price$) ``` In this example, if `item.description` has multiple lines, they will all be indented to line up with the first line: ``` 00123 A fine bottle of 18-year old Oban whiskey. ($148) ``` To nest multiple lines to the same level, align them with the `^` directive in the template. For example: ``` $item.number$ $^$$item.description$ ($item.price$) (Available til $item.sellby$.) ``` will produce ``` 00123 A fine bottle of 18-year old Oban whiskey. ($148) (Available til March 30, 2020.) ``` If a variable occurs by itself on a line, preceded by whitespace and not followed by further text or directives on the same line, and the variable’s value contains multiple lines, it will be nested automatically. 
{#breakable-spaces} ### Breakable spaces Normally, spaces in the template itself (as opposed to values of the interpolated variables) are not breakable, but they can be made breakable in part of the template by using the `~` keyword (ended with another `~`). ``` $~$This long line may break if the document is rendered with a short line length.$~$ ``` {#pipes} ### Pipes A pipe transforms the value of a variable or partial. Pipes are specified using a slash (`/`) between the variable name (or partial) and the pipe name. Example: ``` $for(name)$ $name/uppercase$ $endfor$ $for(metadata/pairs)$ - $it.key$: $it.value$ $endfor$ $employee:name()/uppercase$ ``` Pipes may be chained: ``` $for(employees/pairs)$ $it.key/alpha/uppercase$. $it.name$ $endfor$ ``` Some pipes take parameters: ``` |----------------------|------------| $for(employee)$ $it.name.first/uppercase/left 20 "| "$$it.name.salary/right 10 " | " " |"$ $endfor$ |----------------------|------------| ``` Currently the following pipes are predefined: - `pairs`: Converts a map or array to an array of maps, each with `key` and `value` fields. If the original value was an array, the `key` will be the array index, starting with 1. - `uppercase`: Converts text to uppercase. - `lowercase`: Converts text to lowercase. - `length`: Returns the length of the value: number of characters for a textual value, number of elements for a map or array. - `reverse`: Reverses a textual value or array, and has no effect on other values. - `first`: Returns the first value of an array, if applied to a non-empty array; otherwise returns the original value. - `last`: Returns the last value of an array, if applied to a non-empty array; otherwise returns the original value. - `rest`: Returns all but the first value of an array, if applied to a non-empty array; otherwise returns the original value. - `allbutlast`: Returns all but the last value of an array, if applied to a non-empty array; otherwise returns the original value. 
- `chomp`: Removes trailing newlines (and breakable space). - `nowrap`: Disables line wrapping on breakable spaces. - `alpha`: Converts textual values that can be read as an integer into lowercase alphabetic characters `a..z` (mod 26). This can be used to get lettered enumeration from array indices. To get uppercase letters, chain with `uppercase`. - `roman`: Converts textual values that can be read as an integer into lowercase roman numerals. This can be used to get lettered enumeration from array indices. To get uppercase roman, chain with `uppercase`. - `left n "leftborder" "rightborder"`: Renders a textual value in a block of width `n`, aligned to the left, with an optional left and right border. Has no effect on other values. This can be used to align material in tables. Widths are positive integers indicating the number of characters. Borders are strings inside double quotes; literal `"` and `\` characters must be backslash-escaped. - `right n "leftborder" "rightborder"`: Renders a textual value in a block of width `n`, aligned to the right, and has no effect on other values. - `center n "leftborder" "rightborder"`: Renders a textual value in a block of width `n`, aligned to the center, and has no effect on other values. {#variables} ## Variables {#metadata-variables} ### Metadata variables : `title`, `author`, `date` allow identification of basic aspects of the document. Included in PDF metadata through LaTeX and ConTeXt. These can be set through a [pandoc title block](#extension-pandoc_title_block), which allows for multiple authors, or through a [YAML metadata block](#extension-yaml_metadata_block): ``` --- author: - Aristotle - Peter Abelard ... ``` Note that if you just want to set PDF or HTML metadata, without including a title block in the document itself, you can set the `title-meta`, `author-meta`, and `date-meta` variables. (By default these are set automatically, based on `title`, `author`, and `date`.) 
The page title in HTML is set by `pagetitle`, which is equal to `title` by default. : `subtitle` document subtitle, included in HTML, EPUB, LaTeX, ConTeXt, and docx documents : `abstract` document summary, included in HTML, LaTeX, ConTeXt, AsciiDoc, and docx documents : `abstract-title` title of abstract, currently used only in HTML, EPUB, and docx. This will be set automatically to a localized value, depending on `lang`, but can be manually overridden. : `keywords` list of keywords to be included in HTML, PDF, ODT, pptx, docx and AsciiDoc metadata; repeat as for `author`, above : `subject` document subject, included in ODT, PDF, docx, EPUB, and pptx metadata : `description` document description, included in ODT, docx and pptx metadata. Some applications show this as `Comments` metadata. : `category` document category, included in docx and pptx metadata Additionally, any root-level string metadata, not included in ODT, docx or pptx metadata is added as a _custom property_. The following [YAML](https://yaml.org/spec/1.2/spec.html){title="YAML v1.2 Spec"} metadata block for instance: ``` --- title: 'This is the title' subtitle: "This is the subtitle" author: - Author One - Author Two description: | This is a long description. It consists of two paragraphs ... ``` will include `title`, `author` and `description` as standard document properties and `subtitle` as a custom property when converting to docx, ODT or pptx. {#language-variables} ### Language variables : `lang` identifies the main language of the document using IETF language tags (following the [BCP 47](https://tools.ietf.org/html/bcp47) standard), such as `en` or `en-GB`. The [Language subtag lookup](https://r12a.github.io/app-subtags/) tool can look up or verify these tags. This affects most formats, and controls hyphenation in PDF output when using LaTeX (through [`babel`](https://ctan.org/pkg/babel) and [`polyglossia`](https://ctan.org/pkg/polyglossia)) or ConTeXt. 
Use native pandoc [Divs and Spans](#divs-and-spans) with the `lang` attribute to switch the language: ``` --- lang: en-GB ... Text in the main document language (British English). ::: {lang=fr-CA} > Cette citation est écrite en français canadien. ::: More text in English. ['Zitat auf Deutsch.']{lang=de} ``` : `dir` the base script direction, either `rtl` (right-to-left) or `ltr` (left-to-right). For bidirectional documents, native pandoc `span`s and `div`s with the `dir` attribute (value `rtl` or `ltr`) can be used to override the base direction in some output formats. This may not always be necessary if the final renderer (e.g. the browser, when generating HTML) supports the [Unicode Bidirectional Algorithm](https://www.w3.org/International/articles/inline-bidi-markup/uba-basics). When using LaTeX for bidirectional documents, only the `xelatex` engine is fully supported (use `--pdf-engine=xelatex`). {#variables-for-html} ### Variables for HTML : `document-css` Enables inclusion of most of the [CSS](https://developer.mozilla.org/en-US/docs/Learn/CSS) in the `styles.html` [partial](#partials) (have a look with `pandoc --print-default-data-file=templates/styles.html`). Unless you use [`--css`](#option--css), this variable is set to `true` by default. You can disable it with e.g. `pandoc -M document-css=false`. : `mainfont` sets the CSS `font-family` property on the `html` element. : `fontsize` sets the base CSS `font-size`, which you’d usually set to e.g. `20px`, but it also accepts `pt` (12pt = 16px in most browsers). : `fontcolor` sets the CSS `color` property on the `html` element. : `linkcolor` sets the CSS `color` property on all links. : `monofont` sets the CSS `font-family` property on `code` elements. : `monobackgroundcolor` sets the CSS `background-color` property on `code` elements and adds extra padding. : `linestretch` sets the CSS `line-height` property on the `html` element, which is preferred to be unitless. 
: `maxwidth` sets the CSS `max-width` property (default is 32em). : `backgroundcolor` sets the CSS `background-color` property on the `html` element. : `margin-left`, `margin-right`, `margin-top`, `margin-bottom` sets the corresponding CSS `padding` properties on the `body` element. To override or extend some [CSS](https://developer.mozilla.org/en-US/docs/Learn/CSS) for just one document, include for example: ``` --- header-includes: | <style> blockquote { font-style: italic; } </style> --- ``` {#variables-for-html-math} ### Variables for HTML math : `classoption` when using [KaTeX](#option--katex), you can render display math equations flush left using [YAML metadata](#layout) or with `-M classoption=fleqn`. {#variables-for-html-slides} ### Variables for HTML slides These affect HTML output when [producing slide shows with pandoc](#slide-shows). : `institute` author affiliations: can be a list when there are multiple authors : `revealjs-url` base URL for reveal.js documents (defaults to `https://unpkg.com/reveal.js@^4/`) : `s5-url` base URL for S5 documents (defaults to `s5/default`) : `slidy-url` base URL for Slidy documents (defaults to `https://www.w3.org/Talks/Tools/Slidy2`) : `slideous-url` base URL for Slideous documents (defaults to `slideous`) : `title-slide-attributes` additional attributes for the title slide of reveal.js slide shows. See [background in reveal.js, beamer, and pptx](#background-in-reveal.js-beamer-and-pptx) for an example. All [reveal.js configuration options](https://revealjs.com/config/) are available as variables. To turn off boolean flags that default to true in reveal.js, use `0`. {#variables-for-beamer-slides} ### Variables for Beamer slides These variables change the appearance of PDF slides using [`beamer`](https://ctan.org/pkg/beamer). 
: `aspectratio` slide aspect ratio (`43` for 4:3 \[default\], `169` for 16:9, `1610` for 16:10, `149` for 14:9, `141` for 1.41:1, `54` for 5:4, `32` for 3:2) : `beameroption` add extra beamer option with `\setbeameroption{}` : `institute` author affiliations: can be a list when there are multiple authors : `logo` logo image for slides : `navigation` controls navigation symbols (default is `empty` for no navigation symbols; other valid values are `frame`, `vertical`, and `horizontal`) : `section-titles` enables "title pages" for new sections (default is true) : `theme`, `colortheme`, `fonttheme`, `innertheme`, `outertheme` beamer themes : `themeoptions` options for LaTeX beamer themes (a list). : `titlegraphic` image for title slide {#variables-for-powerpoint} ### Variables for PowerPoint These variables control the visual aspects of a slide show that are not easily controlled via templates. : `monofont` font to use for code. {#variables-for-latex} ### Variables for LaTeX Pandoc uses these variables when [creating a PDF](#creating-a-pdf) with a LaTeX engine. {#layout} #### Layout : `block-headings` make `\paragraph` and `\subparagraph` (fourth- and fifth-level headings, or fifth- and sixth-level with book classes) free-standing rather than run-in; requires further formatting to distinguish from `\subsubsection` (third- or fourth-level headings). Instead of using this option, [KOMA-Script](https://ctan.org/pkg/koma-script) can adjust headings more extensively: ``` --- documentclass: scrartcl header-includes: | \RedeclareSectionCommand[ beforeskip=-10pt plus -2pt minus -1pt, afterskip=1sp plus -1sp minus 1sp, font=\normalfont\itshape]{paragraph} \RedeclareSectionCommand[ beforeskip=-10pt plus -2pt minus -1pt, afterskip=1sp plus -1sp minus 1sp, font=\normalfont\scshape, indent=0pt]{subparagraph} ... ``` : `classoption` option for document class, e.g. `oneside`; repeat for multiple options: ``` --- classoption: - twocolumn - landscape ... 
``` : `documentclass` document class: usually one of the standard classes, [`article`](https://ctan.org/pkg/article), [`book`](https://ctan.org/pkg/book), and [`report`](https://ctan.org/pkg/report); the [KOMA-Script](https://ctan.org/pkg/koma-script) equivalents, `scrartcl`, `scrbook`, and `scrreprt`, which default to smaller margins; or [`memoir`](https://ctan.org/pkg/memoir) : `geometry` option for [`geometry`](https://ctan.org/pkg/geometry) package, e.g. `margin=1in`; repeat for multiple options: ``` --- geometry: - top=30mm - left=20mm - heightrounded ... ``` : `hyperrefoptions` option for [`hyperref`](https://ctan.org/pkg/hyperref) package, e.g. `linktoc=all`; repeat for multiple options: ``` --- hyperrefoptions: - linktoc=all - pdfwindowui - pdfpagemode=FullScreen ... ``` : `indent` if true, pandoc will use document class settings for indentation (the default LaTeX template otherwise removes indentation and adds space between paragraphs) : `linestretch` adjusts line spacing using the [`setspace`](https://ctan.org/pkg/setspace) package, e.g. `1.25`, `1.5` : `margin-left`, `margin-right`, `margin-top`, `margin-bottom` sets margins if `geometry` is not used (otherwise `geometry` overrides these) : `pagestyle` control `\pagestyle{}`: the default article class supports `plain` (default), `empty` (no running heads or page numbers), and `headings` (section titles in running heads) : `papersize` paper size, e.g. 
`letter`, `a4` : `secnumdepth` numbering depth for sections (with `--number-sections` option or `numbersections` variable) : `beamerarticle` produce an article from Beamer slides {#fonts} #### Fonts : `fontenc` allows font encoding to be specified through `fontenc` package (with `pdflatex`); default is `T1` (see [LaTeX font encodings guide](https://ctan.org/pkg/encguide)) : `fontfamily` font package for use with `pdflatex`: [TeX Live](https://www.tug.org/texlive/) includes many options, documented in the [LaTeX Font Catalogue](https://tug.org/FontCatalogue/). The default is [Latin Modern](https://ctan.org/pkg/lm). : `fontfamilyoptions` options for package used as `fontfamily`; repeat for multiple options. For example, to use the Libertine font with proportional lowercase (old-style) figures through the [`libertinus`](https://ctan.org/pkg/libertinus) package: ``` --- fontfamily: libertinus fontfamilyoptions: - osf - p ... ``` : `fontsize` font size for body text. The standard classes allow 10pt, 11pt, and 12pt. To use another size, set `documentclass` to one of the [KOMA-Script](https://ctan.org/pkg/koma-script) classes, such as `scrartcl` or `scrbook`. : `mainfont`, `sansfont`, `monofont`, `mathfont`, `CJKmainfont`, `CJKsansfont`, `CJKmonofont` font families for use with `xelatex` or `lualatex`: take the name of any system font, using the [`fontspec`](https://ctan.org/pkg/fontspec) package. `CJKmainfont` uses the [`xecjk`](https://ctan.org/pkg/xecjk) package. : `mainfontoptions`, `sansfontoptions`, `monofontoptions`, `mathfontoptions`, `CJKoptions` options to use with `mainfont`, `sansfont`, `monofont`, `mathfont`, `CJKmainfont` in `xelatex` and `lualatex`. Allow for any choices available through [`fontspec`](https://ctan.org/pkg/fontspec); repeat for multiple options. 
For example, to use the [TeX Gyre](http://www.gust.org.pl/projects/e-foundry/tex-gyre) version of Palatino with lowercase figures: ``` --- mainfont: TeX Gyre Pagella mainfontoptions: - Numbers=Lowercase - Numbers=Proportional ... ``` : `babelfonts` a map of Babel language names (e.g. `chinese`) to the font to be used with the language: ``` --- babelfonts: chinese-hant: "Noto Serif CJK TC" russian: "Noto Serif" ... ``` : `microtypeoptions` options to pass to the microtype package {#links} #### Links : `colorlinks` add color to link text; automatically enabled if any of `linkcolor`, `filecolor`, `citecolor`, `urlcolor`, or `toccolor` are set : `boxlinks` add visible box around links (has no effect if `colorlinks` is set) : `linkcolor`, `filecolor`, `citecolor`, `urlcolor`, `toccolor` color for internal links, external links, citation links, linked URLs, and links in table of contents, respectively: uses options allowed by [`xcolor`](https://ctan.org/pkg/xcolor), including the `dvipsnames`, `svgnames`, and `x11names` lists : `links-as-notes` causes links to be printed as footnotes : `urlstyle` style for URLs (e.g., `tt`, `rm`, `sf`, and, the default, `same`) {#front-matter} #### Front matter : `lof`, `lot` include list of figures, list of tables : `thanks` contents of acknowledgments footnote after document title : `toc` include table of contents (can also be set using `--toc/--table-of-contents`) : `toc-depth` level of section to include in table of contents {#biblatex-bibliographies} #### BibLaTeX Bibliographies These variables function when using BibLaTeX for [citation rendering](#citation-rendering). 
: `biblatexoptions` list of options for biblatex : `biblio-style` bibliography style, when used with `--natbib` and `--biblatex` : `biblio-title` bibliography title, when used with `--natbib` and `--biblatex` : `bibliography` bibliography to use for resolving references : `natbiboptions` list of options for natbib {#variables-for-context} ### Variables for ConTeXt Pandoc uses these variables when [creating a PDF](#creating-a-pdf) with ConTeXt. : `fontsize` font size for body text (e.g. `10pt`, `12pt`) : `headertext`, `footertext` text to be placed in running header or footer (see [ConTeXt Headers and Footers](https://wiki.contextgarden.net/Headers_and_Footers)); repeat up to four times for different placement : `indenting` controls indentation of paragraphs, e.g. `yes,small,next` (see [ConTeXt Indentation](https://wiki.contextgarden.net/Indentation)); repeat for multiple options : `interlinespace` adjusts line spacing, e.g. `4ex` (using [`setupinterlinespace`](https://wiki.contextgarden.net/Command/setupinterlinespace)); repeat for multiple options : `layout` options for page margins and text arrangement (see [ConTeXt Layout](https://wiki.contextgarden.net/Layout)); repeat for multiple options : `linkcolor`, `contrastcolor` color for links outside and inside a page, e.g. `red`, `blue` (see [ConTeXt Color](https://wiki.contextgarden.net/Color)) : `linkstyle` typeface style for links, e.g. 
`normal`, `bold`, `slanted`, `boldslanted`, `type`, `cap`, `small` : `lof`, `lot` include list of figures, list of tables : `mainfont`, `sansfont`, `monofont`, `mathfont` font families: take the name of any system font (see [ConTeXt Font Switching](https://wiki.contextgarden.net/Font_Switching)) : `margin-left`, `margin-right`, `margin-top`, `margin-bottom` sets margins, if `layout` is not used (otherwise `layout` overrides these) : `pagenumbering` page number style and location (using [`setuppagenumbering`](https://wiki.contextgarden.net/Command/setuppagenumbering)); repeat for multiple options : `papersize` paper size, e.g. `letter`, `A4`, `landscape` (see [ConTeXt Paper Setup](https://wiki.contextgarden.net/PaperSetup)); repeat for multiple options : `pdfa` adds to the preamble the setup necessary to generate PDF/A of the type specified, e.g. `1a:2005`, `2a`. If no type is specified (i.e. the value is set to True, by e.g. `--metadata=pdfa` or `pdfa: true` in a YAML metadata block), `1b:2005` will be used as default, for reasons of backwards compatibility. Using `--variable=pdfa` without specified value is not supported. To successfully generate PDF/A the required ICC color profiles have to be available and the content and all included files (such as images) have to be standard-conforming. The ICC profiles and output intent may be specified using the variables `pdfaiccprofile` and `pdfaintent`. See also [ConTeXt PDFA](https://wiki.contextgarden.net/PDF/A) for more details. : `pdfaiccprofile` when used in conjunction with `pdfa`, specifies the ICC profile to use in the PDF, e.g. `default.cmyk`. If left unspecified, `sRGB.icc` is used as default. May be repeated to include multiple profiles. Note that the profiles have to be available on the system. They can be obtained from [ConTeXt ICC Profiles](https://wiki.contextgarden.net/PDFX#ICC_profiles). : `pdfaintent` when used in conjunction with `pdfa`, specifies the output intent for the colors, e.g. 
`ISO coated v2 300\letterpercent\space (ECI)` If left unspecified, `sRGB IEC61966-2.1` is used as default. : `toc` include table of contents (can also be set using `--toc/--table-of-contents`) : `urlstyle` typeface style for links without link text, e.g. `normal`, `bold`, `slanted`, `boldslanted`, `type`, `cap`, `small` : `whitespace` spacing between paragraphs, e.g. `none`, `small` (using [`setupwhitespace`](https://wiki.contextgarden.net/Command/setupwhitespace)) : `includesource` include all source documents as file attachments in the PDF file {#variables-for-wkhtmltopdf} ### Variables for `wkhtmltopdf` Pandoc uses these variables when [creating a PDF](#creating-a-pdf) with [`wkhtmltopdf`](https://wkhtmltopdf.org). The `--css` option also affects the output. : `footer-html`, `header-html` add information to the header and footer : `margin-left`, `margin-right`, `margin-top`, `margin-bottom` set the page margins : `papersize` sets the PDF paper size {#variables-for-man-pages} ### Variables for man pages : `adjusting` adjusts text to left (`l`), right (`r`), center (`c`), or both (`b`) margins : `footer` footer in man pages : `header` header in man pages : `hyphenate` if `true` (the default), hyphenation will be used : `section` section number in man pages {#variables-for-typst} ### Variables for Typst : `margin` A dictionary with the fields defined in the Typst documentation: `x`, `y`, `top`, `bottom`, `left`, `right`. : `papersize` Paper size: `a4`, `us-letter`, etc. : `mainfont` Name of system font to use for the main font. : `fontsize` Font size (e.g., `12pt`). : `section-numbering` Schema to use for numbering sections, e.g. `1.A.1`. : `columns` Number of columns for body text. {#variables-for-ms} ### Variables for ms : `fontfamily` `A` (Avant Garde), `B` (Bookman), `C` (Helvetica), `HN` (Helvetica Narrow), `P` (Palatino), or `T` (Times New Roman). This setting does not affect source code, which is always displayed using monospace Courier. 
These built-in fonts are limited in their coverage of characters. Additional fonts may be installed using the script [`install-font.sh`](https://www.schaffter.ca/mom/bin/install-font.sh) provided by Peter Schaffter and documented in detail on [his web site](https://www.schaffter.ca/mom/momdoc/appendices.html#steps). : `indent` paragraph indent (e.g. `2m`) : `lineheight` line height (e.g. `12p`) : `pointsize` point size (e.g. `10p`) {#variables-set-automatically} ### Variables set automatically Pandoc sets these variables automatically in response to [options](#options) or document contents; users can also modify them. These vary depending on the output format, and include the following: : `body` body of document : `date-meta` the `date` variable converted to ISO 8601 YYYY-MM-DD, included in all HTML based formats (dzslides, epub, html, html4, html5, revealjs, s5, slideous, slidy). The recognized formats for `date` are: `mm/dd/yyyy`, `mm/dd/yy`, `yyyy-mm-dd` (ISO 8601), `dd MM yyyy` (e.g. either `02 Apr 2018` or `02 April 2018`), `MM dd, yyyy` (e.g. `Apr. 02, 2018` or `April 02, 2018`), `yyyy\[mm\[dd\]\]` (e.g. `20180402`, `201804` or `2018`). : `header-includes` contents specified by `-H/--include-in-header` (may have multiple values) : `include-before` contents specified by `-B/--include-before-body` (may have multiple values) : `include-after` contents specified by `-A/--include-after-body` (may have multiple values) : `meta-json` JSON representation of all of the document’s metadata. Field values are transformed to the selected output format. : `numbersections` non-null value if `-N/--number-sections` was specified : `sourcefile`, `outputfile` source and destination filenames, as given on the command line. `sourcefile` can also be a list if input comes from multiple files, or empty if input is from stdin. 
You can use the following snippet in your template to distinguish them: ``` $if(sourcefile)$ $for(sourcefile)$ $sourcefile$ $endfor$ $else$ (stdin) $endif$ ``` Similarly, `outputfile` can be `-` if output goes to the terminal. If you need absolute paths, use e.g. `$curdir$/$sourcefile$`. : `curdir` working directory from which pandoc is run. : `pandoc-version` pandoc version. : `toc` non-null value if `--toc/--table-of-contents` was specified : `toc-title` title of table of contents (works only with EPUB, HTML, revealjs, opendocument, odt, docx, pptx, beamer, LaTeX) {#extensions} # Extensions The behavior of some of the readers and writers can be adjusted by enabling or disabling various extensions. An extension can be enabled by adding `+EXTENSION` to the format name and disabled by adding `-EXTENSION`. For example, `--from markdown_strict+footnotes` is strict Markdown with footnotes enabled, while `--from markdown-footnotes-pipe_tables` is pandoc’s Markdown without footnotes or pipe tables. The markdown reader and writer make by far the most use of extensions. Extensions only used by them are therefore covered in the section [Pandoc’s Markdown](#pandocs-markdown) below (see [Markdown variants](#markdown-variants) for `commonmark` and `gfm`). In the following, extensions that also work for other formats are covered. Note that markdown extensions added to the `ipynb` format affect Markdown cells in Jupyter notebooks (as do command-line options like `--markdown-headings`). {#typography} ## Typography {#extension-smart} #### Extension: `smart` Interpret straight quotes as curly quotes, `---` as em-dashes, `--` as en-dashes, and `...` as ellipses. Nonbreaking spaces are inserted after certain abbreviations, such as "Mr." 
This extension can be enabled/disabled for the following formats: : input formats `markdown`, `commonmark`, `latex`, `mediawiki`, `org`, `rst`, `twiki`, `html` : output formats `markdown`, `latex`, `context`, `rst` : enabled by default in `markdown`, `latex`, `context` (both input and output) Note: If you are _writing_ Markdown, then the `smart` extension has the reverse effect: what would have been curly quotes comes out straight. In LaTeX, `smart` means to use the standard TeX ligatures for quotation marks (` `` ` and `''` for double quotes, `` ` `` and `'` for single quotes) and dashes (`--` for en-dash and `---` for em-dash). If `smart` is disabled, then in reading LaTeX pandoc will parse these characters literally. In writing LaTeX, enabling `smart` tells pandoc to use the ligatures when possible; if `smart` is disabled pandoc will use unicode quotation mark and dash characters. {#headings-and-sections} ## Headings and sections {#extension-auto_identifiers} #### Extension: `auto_identifiers` A heading without an explicitly specified identifier will be automatically assigned a unique identifier based on the heading text. This extension can be enabled/disabled for the following formats: : input formats `markdown`, `latex`, `rst`, `mediawiki`, `textile` : output formats `markdown`, `muse` : enabled by default in `markdown`, `muse` The default algorithm used to derive the identifier from the heading text is: - Remove all formatting, links, etc. - Remove all footnotes. - Remove all non-alphanumeric characters, except underscores, hyphens, and periods. - Replace all spaces and newlines with hyphens. - Convert all alphabetic characters to lowercase. - Remove everything up to the first letter (identifiers may not begin with a number or punctuation mark). - If nothing is left after this, use the identifier `section`. 
Thus, for example, |Heading|Identifier| |:--|:--| |`Heading identifiers in HTML`|`heading-identifiers-in-html`| |`Maître d'hôtel`|`maître-dhôtel`| |`*Dogs*?--in *my* house?`|`dogs--in-my-house`| |`[HTML], [S5], or [RTF]?`|`html-s5-or-rtf`| |`3. Applications`|`applications`| |`33`|`section`| These rules should, in most cases, allow one to determine the identifier from the heading text. The exception is when several headings have the same text; in this case, the first will get an identifier as described above; the second will get the same identifier with `-1` appended; the third with `-2`; and so on. (However, a different algorithm is used if `gfm_auto_identifiers` is enabled; see below.) These identifiers are used to provide link targets in the table of contents generated by the `--toc|--table-of-contents` option. They also make it easy to provide links from one section of a document to another. A link to this section, for example, might look like this: ``` See the section on [heading identifiers](#heading-identifiers-in-html-latex-and-context). ``` Note, however, that this method of providing links to sections works only in HTML, LaTeX, and ConTeXt formats. If the `--section-divs` option is specified, then each section will be wrapped in a `section` (or a `div`, if `html4` was specified), and the identifier will be attached to the enclosing `
&lt;section&gt;` (or `&lt;div&gt;
`) tag rather than the heading itself. This allows entire sections to be manipulated using JavaScript or treated differently in CSS. {#extension-ascii_identifiers} #### Extension: `ascii_identifiers` Causes the identifiers produced by `auto_identifiers` to be pure ASCII. Accents are stripped off of accented Latin letters, and non-Latin letters are omitted. {#extension-gfm_auto_identifiers} #### Extension: `gfm_auto_identifiers` Changes the algorithm used by `auto_identifiers` to conform to GitHub’s method. Spaces are converted to dashes (`-`), uppercase characters to lowercase characters, and punctuation characters other than `-` and `_` are removed. Emojis are replaced by their names. {#math-input} ## Math Input The extensions [`tex_math_dollars`](#extension-tex_math_dollars), [`tex_math_single_backslash`](#extension-tex_math_single_backslash), and [`tex_math_double_backslash`](#extension-tex_math_double_backslash) are described in the section about Pandoc’s Markdown. However, they can also be used with HTML input. This is handy for reading web pages formatted using MathJax, for example. {#raw-htmltex} ## Raw HTML/TeX The following extensions are described in more detail in their respective sections of [Pandoc’s Markdown](#pandocs-markdown): - [`raw_html`](#extension-raw_html) allows HTML elements which are not representable in pandoc’s AST to be parsed as raw HTML. By default, this is disabled for HTML input. - [`raw_tex`](#extension-raw_tex) allows raw LaTeX, TeX, and ConTeXt to be included in a document. This extension can be enabled/disabled for the following formats (in addition to `markdown`): : input formats `latex`, `textile`, `html` (environments, `\ref`, and `\eqref` only), `ipynb` : output formats `textile`, `commonmark` Note: as applied to `ipynb`, `raw_html` and `raw_tex` affect not only raw TeX in markdown cells, but data with mime type `text/html` in output cells. 
Since the `ipynb` reader attempts to preserve the richest possible outputs when several options are given, you will get best results if you disable `raw_html` and `raw_tex` when converting to formats like `docx` which don’t allow raw `html` or `tex`. - [`native_divs`](#extension-native_divs) causes HTML `div` elements to be parsed as native pandoc Div blocks. If you want them to be parsed as raw HTML, use `-f html-native_divs+raw_html`. - [`native_spans`](#extension-native_spans) causes HTML `span` elements to be parsed as native pandoc Span inlines. If you want them to be parsed as raw HTML, use `-f html-native_spans+raw_html`. If you want to drop all `div`s and `span`s when converting HTML to Markdown, you can use `pandoc -f html-native_divs-native_spans -t markdown`. {#literate-haskell-support} ## Literate Haskell support {#extension-literate_haskell} #### Extension: `literate_haskell` Treat the document as literate Haskell source. This extension can be enabled/disabled for the following formats: : input formats `markdown`, `rst`, `latex` : output formats `markdown`, `rst`, `latex`, `html` If you append `+lhs` (or `+literate_haskell`) to one of the formats above, pandoc will treat the document as literate Haskell source. This means that - In Markdown input, "bird track" sections will be parsed as Haskell code rather than block quotations. Text between `\begin{code}` and `\end{code}` will also be treated as Haskell code. For ATX-style headings the character '=' will be used instead of '#'. - In Markdown output, code blocks with classes `haskell` and `literate` will be rendered using bird tracks, and block quotations will be indented one space, so they will not be treated as Haskell code. In addition, headings will be rendered setext-style (with underlines) rather than ATX-style (with '#' characters). (This is because ghc treats '#' characters in column 1 as introducing line numbers.) 
- In restructured text input, "bird track" sections will be parsed as Haskell code. - In restructured text output, code blocks with class `haskell` will be rendered using bird tracks. - In LaTeX input, text in `code` environments will be parsed as Haskell code. - In LaTeX output, code blocks with class `haskell` will be rendered inside `code` environments. - In HTML output, code blocks with class `haskell` will be rendered with class `literatehaskell` and bird tracks. Examples: ``` pandoc -f markdown+lhs -t html ``` reads literate Haskell source formatted with Markdown conventions and writes ordinary HTML (without bird tracks). ``` pandoc -f markdown+lhs -t html+lhs ``` writes HTML with the Haskell code in bird tracks, so it can be copied and pasted as literate Haskell source. Note that GHC expects the bird tracks in the first column, so indented literate code blocks (e.g. inside an itemized environment) will not be picked up by the Haskell compiler. {#other-extensions} ## Other extensions {#extension-empty_paragraphs} #### Extension: `empty_paragraphs` Allows empty paragraphs. By default empty paragraphs are omitted. This extension can be enabled/disabled for the following formats: : input formats `docx`, `html` : output formats `docx`, `odt`, `opendocument`, `html` {#extension-native_numbering} #### Extension: `native_numbering` Enables native numbering of figures and tables. Enumeration starts at 1. This extension can be enabled/disabled for the following formats: : output formats `odt`, `opendocument`, `docx` {#extension-xrefs_name} #### Extension: `xrefs_name` Links to headings, figures and tables inside the document are substituted with cross-references that will use the name or caption of the referenced item. The original link text is replaced once the generated document is refreshed. This extension can be combined with `xrefs_number` in which case numbers will appear before the name. 
Text in cross-references is only made consistent with the referenced item once the document has been refreshed. This extension can be enabled/disabled for the following formats: : output formats `odt`, `opendocument` {#extension-xrefs_number} #### Extension: `xrefs_number` Links to headings, figures and tables inside the document are substituted with cross-references that will use the number of the referenced item. The original link text is discarded. This extension can be combined with `xrefs_name` in which case the name or caption numbers will appear after the number. For the `xrefs_number` to be useful heading numbers must be enabled in the generated document, also table and figure captions must be enabled using for example the `native_numbering` extension. Numbers in cross-references are only visible in the final document once it has been refreshed. This extension can be enabled/disabled for the following formats: : output formats `odt`, `opendocument` {#ext-styles} #### Extension: `styles` When converting from docx, read all docx styles as divs (for paragraph styles) and spans (for character styles) regardless of whether pandoc understands the meaning of these styles. This can be used with [docx custom styles](#custom-styles). Disabled by default. : input formats `docx` {#extension-amuse} #### Extension: `amuse` In the `muse` input format, this enables Text::Amuse extensions to Emacs Muse markup. {#extension-raw_markdown} #### Extension: `raw_markdown` In the `ipynb` input format, this causes Markdown cells to be included as raw Markdown blocks (allowing lossless round-tripping) rather than being parsed. Use this only when you are targeting `ipynb` or a markdown-based output format. {#org-citations} #### Extension: `citations` When the `citations` extension is enabled in `org`, org-cite and org-ref style citations will be parsed as native pandoc citations. 
When `citations` is enabled in `docx`, citations inserted by Zotero or Mendeley or EndNote plugins will be parsed as native pandoc citations. (Otherwise, the formatted citations generated by the bibliographic software will be parsed as regular text.) {#org-fancy-lists} #### Extension: `fancy_lists` Some aspects of [Pandoc’s Markdown fancy lists](#extension-fancy_lists) are also accepted in `org` input, mimicking the option `org-list-allow-alphabetical` in Emacs. As in Org Mode, enabling this extension allows lowercase and uppercase alphabetical markers for ordered lists to be parsed in addition to arabic ones. Note that for Org, this does not include roman numerals or the `#` placeholder that are enabled by the extension in Pandoc’s Markdown. {#extension-element_citations} #### Extension: `element_citations` In the `jats` output formats, this causes reference items to be replaced with `` elements. These elements are not influenced by CSL styles, but all information on the item is included in tags. {#extension-ntb} #### Extension: `ntb` In the `context` output format this enables the use of [Natural Tables (TABLE)](https://wiki.contextgarden.net/TABLE) instead of the default [Extreme Tables (xtables)](https://wiki.contextgarden.net/xtables). Natural tables allow more fine-grained global customization but come at a performance penalty compared to extreme tables. {#extension-tagging} #### Extension: `tagging` Enabling this extension with `context` output will produce markup suitable for the production of tagged PDFs. This includes additional markers for paragraphs and alternative markup for emphasized text. The `emphasis-command` template variable is set if the extension is enabled. {#pandocs-markdown} # Pandoc’s Markdown Pandoc understands an extended and slightly revised version of John Gruber’s [Markdown](https://daringfireball.net/projects/markdown/) syntax. This document explains the syntax, noting differences from original Markdown. 
Except where noted, these differences can be suppressed by using the `markdown_strict` format instead of `markdown`. Extensions can be enabled or disabled to specify the behavior more granularly. They are described in the following. See also [Extensions](#extensions) above, for extensions that work also on other formats. {#philosophy} ## Philosophy Markdown is designed to be easy to write, and, even more importantly, easy to read: > A Markdown-formatted document should be publishable as-is, as plain > text, without looking like it’s been marked up with tags or formatting > instructions. – [John > Gruber](https://daringfireball.net/projects/markdown/syntax#philosophy) This principle has guided pandoc’s decisions in finding syntax for tables, footnotes, and other extensions. There is, however, one respect in which pandoc’s aims are different from the original aims of Markdown. Whereas Markdown was originally designed with HTML generation in mind, pandoc is designed for multiple output formats. Thus, while pandoc allows the embedding of raw HTML, it discourages it, and provides other, non-HTMLish ways of representing important document elements like definition lists, tables, mathematics, and footnotes. {#paragraphs} ## Paragraphs A paragraph is one or more lines of text followed by one or more blank lines. Newlines are treated as spaces, so you can reflow your paragraphs as you like. If you need a hard line break, put two or more spaces at the end of a line. {#extension-escaped_line_breaks} #### Extension: `escaped_line_breaks` A backslash followed by a newline is also a hard line break. Note: in multiline and grid table cells, this is the only way to create a hard line break, since trailing spaces in the cells are ignored. {#headings} ## Headings There are two kinds of headings: Setext and ATX. 
{#setext-style-headings} ### Setext-style headings A setext-style heading is a line of text "underlined" with a row of `=` signs (for a level-one heading) or `-` signs (for a level-two heading): ``` A level-one heading =================== A level-two heading ------------------- ``` The heading text can contain inline formatting, such as emphasis (see [Inline formatting](#inline-formatting), below). {#atx-style-headings} ### ATX-style headings An ATX-style heading consists of one to six `#` signs and a line of text, optionally followed by any number of `#` signs. The number of `#` signs at the beginning of the line is the heading level: ``` ## A level-two heading ### A level-three heading ### ``` As with setext-style headings, the heading text can contain formatting: ``` # A level-one heading with a [link](/url) and *emphasis* ``` {#extension-blank_before_header} #### Extension: `blank_before_header` Original Markdown syntax does not require a blank line before a heading. Pandoc does require this (except, of course, at the beginning of the document). The reason for the requirement is that it is all too easy for a `#` to end up at the beginning of a line by accident (perhaps through line wrapping). Consider, for example: ``` I like several of their flavors of ice cream: #22, for example, and #5. ``` {#extension-space_in_atx_header} #### Extension: `space_in_atx_header` Many Markdown implementations do not require a space between the opening `#`s of an ATX heading and the heading text, so that `#5 bolt` and `#hashtag` count as headings. With this extension, pandoc does require the space. {#heading-identifiers} ### Heading identifiers See also the [`auto_identifiers` extension](#extension-auto_identifiers) above. 
{#extension-header_attributes} #### Extension: `header_attributes` Headings can be assigned attributes using this syntax at the end of the line containing the heading text: ``` {#identifier .class .class key=value key=value} ``` Thus, for example, the following headings will all be assigned the identifier `foo`: ``` # My heading {#foo} ## My heading ## {#foo} My other heading {#foo} --------------- ``` (This syntax is compatible with [PHP Markdown Extra](https://michelf.ca/projects/php-markdown/extra/).) Note that although this syntax allows assignment of classes and key/value attributes, writers generally don’t use all of this information. Identifiers, classes, and key/value attributes are used in HTML and HTML-based formats such as EPUB and slidy. Identifiers are used for labels and link anchors in the LaTeX, ConTeXt, Textile, Jira markup, and AsciiDoc writers. Headings with the class `unnumbered` will not be numbered, even if `--number-sections` is specified. A single hyphen (`-`) in an attribute context is equivalent to `.unnumbered`, and preferable in non-English documents. So, ``` # My heading {-} ``` is just the same as ``` # My heading {.unnumbered} ``` If the `unlisted` class is present in addition to `unnumbered`, the heading will not be included in a table of contents. (Currently this feature is only implemented for certain formats: those based on LaTeX and HTML, PowerPoint, and RTF.) {#extension-implicit_header_references} #### Extension: `implicit_header_references` Pandoc behaves as if reference links have been defined for each heading. 
So, to link to a heading ``` # Heading identifiers in HTML ``` you can simply write ``` [Heading identifiers in HTML] ``` or ``` [Heading identifiers in HTML][] ``` or ``` [the section on heading identifiers][heading identifiers in HTML] ``` instead of giving the identifier explicitly: ``` [Heading identifiers in HTML](#heading-identifiers-in-html) ``` If there are multiple headings with identical text, the corresponding reference will link to the first one only, and you will need to use explicit links to link to the others, as described above. Like regular reference links, these references are case-insensitive. Explicit link reference definitions always take priority over implicit heading references. So, in the following example, the link will point to `bar`, not to `#foo`: ``` # Foo [foo]: bar See [foo] ``` {#block-quotations} ## Block quotations Markdown uses email conventions for quoting blocks of text. A block quotation is one or more paragraphs or other block elements (such as lists or headings), with each line preceded by a `>` character and an optional space. (The `>` need not start at the left margin, but it should not be indented more than three spaces.) ``` > This is a block quote. This > paragraph has two lines. > > 1. This is a list inside a block quote. > 2. Second item. ``` A "lazy" form, which requires the `>` character only on the first line of each block, is also allowed: ``` > This is a block quote. This paragraph has two lines. > 1. This is a list inside a block quote. 2. Second item. ``` Among the block elements that can be contained in a block quote are other block quotes. That is, block quotes can be nested: ``` > This is a block quote. > > > A block quote within a block quote. ``` If the `>` character is followed by an optional space, that space will be considered part of the block quote marker and not part of the indentation of the contents. 
Thus, to put an indented code block in a block quote, you need five spaces after the `>`: ``` > code ``` {#extension-blank_before_blockquote} #### Extension: `blank_before_blockquote` Original Markdown syntax does not require a blank line before a block quote. Pandoc does require this (except, of course, at the beginning of the document). The reason for the requirement is that it is all too easy for a `>` to end up at the beginning of a line by accident (perhaps through line wrapping). So, unless the `markdown_strict` format is used, the following does not produce a nested block quote in pandoc: ``` > This is a block quote. >> Not nested, since `blank_before_blockquote` is enabled by default ``` {#verbatim-code-blocks} ## Verbatim (code) blocks {#indented-code-blocks} ### Indented code blocks A block of text indented four spaces (or one tab) is treated as verbatim text: that is, special characters do not trigger special formatting, and all spaces and line breaks are preserved. For example, ``` if (a > 3) { moveShip(5 * gravity, DOWN); } ``` The initial (four space or one tab) indentation is not considered part of the verbatim text, and is removed in the output. Note: blank lines in the verbatim text need not begin with four spaces. {#fenced-code-blocks} ### Fenced code blocks {#extension-fenced_code_blocks} #### Extension: `fenced_code_blocks` In addition to standard indented code blocks, pandoc supports _fenced_ code blocks. These begin with a row of three or more tildes (`~`) and end with a row of tildes that must be at least as long as the starting row. Everything between these lines is treated as code. No indentation is necessary: ``` ~~~~~~~ if (a > 3) { moveShip(5 * gravity, DOWN); } ~~~~~~~ ``` Like regular code blocks, fenced code blocks must be separated from surrounding text by blank lines. 
If the code itself contains a row of tildes or backticks, just use a longer row of tildes or backticks at the start and end: ``` ~~~~~~~~~~~~~~~~ ~~~~~~~~~~ code including tildes ~~~~~~~~~~ ~~~~~~~~~~~~~~~~ ``` {#extension-backtick_code_blocks} #### Extension: `backtick_code_blocks` Same as `fenced_code_blocks`, but uses backticks (`` ` ``) instead of tildes (`~`). {#extension-fenced_code_attributes} #### Extension: `fenced_code_attributes` Optionally, you may attach attributes to fenced or backtick code block using this syntax: ``` ~~~~ {#mycode .haskell .numberLines startFrom="100"} qsort [] = [] qsort (x:xs) = qsort (filter (< x) xs) ++ [x] ++ qsort (filter (>= x) xs) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``` Here `mycode` is an identifier, `haskell` and `numberLines` are classes, and `startFrom` is an attribute with value `100`. Some output formats can use this information to do syntax highlighting. Currently, the only output formats that use this information are HTML, LaTeX, Docx, Ms, and PowerPoint. If highlighting is supported for your output format and language, then the code block above will appear highlighted, with numbered lines. (To see which languages are supported, type `pandoc --list-highlight-languages`.) Otherwise, the code block above will appear as follows: ```
<pre class="haskell">
  <code>
  ...
  </code>
</pre>
``` The `numberLines` (or `number-lines`) class will cause the lines of the code block to be numbered, starting with `1` or the value of the `startFrom` attribute. The `lineAnchors` (or `line-anchors`) class will cause the lines to be clickable anchors in HTML output. A shortcut form can also be used for specifying the language of the code block: ```` ```haskell qsort [] = [] ``` ```` This is equivalent to: ```` ``` {.haskell} qsort [] = [] ``` ```` This shortcut form may be combined with attributes: ```` ```haskell {.numberLines} qsort [] = [] ``` ```` Which is equivalent to: ```` ``` {.haskell .numberLines} qsort [] = [] ``` ```` If the `fenced_code_attributes` extension is disabled, but input contains class attribute(s) for the code block, the first class attribute will be printed after the opening fence as a bare word. To prevent all highlighting, use the `--no-highlight` flag. To set the highlighting style, use `--highlight-style`. For more information on highlighting, see [Syntax highlighting](#syntax-highlighting), below. {#line-blocks} ## Line blocks {#extension-line_blocks} #### Extension: `line_blocks` A line block is a sequence of lines beginning with a vertical bar (`|`) followed by a space. The division into lines will be preserved in the output, as will any leading spaces; otherwise, the lines will be formatted as Markdown. This is useful for verse and addresses: ``` | The limerick packs laughs anatomical | In space that is quite economical. | But the good ones I've seen | So seldom are clean | And the clean ones so seldom are comical | 200 Main St. | Berkeley, CA 94718 ``` The lines can be hard-wrapped if needed, but the continuation line must begin with a space. ``` | The Right Honorable Most Venerable and Righteous Samuel L. Constable, Jr. | 200 Main St. | Berkeley, CA 94718 ``` Inline formatting (such as emphasis) is allowed in the content, but not block-level formatting (such as block quotes or lists). 
This syntax is borrowed from [reStructuredText](https://docutils.sourceforge.io/docs/ref/rst/introduction.html). {#lists} ## Lists {#bullet-lists} ### Bullet lists A bullet list is a list of bulleted list items. A bulleted list item begins with a bullet (`*`, `+`, or `-`). Here is a simple example: ``` * one * two * three ``` This will produce a "compact" list. If you want a "loose" list, in which each item is formatted as a paragraph, put spaces between the items: ``` * one * two * three ``` The bullets need not be flush with the left margin; they may be indented one, two, or three spaces. The bullet must be followed by whitespace. List items look best if subsequent lines are flush with the first line (after the bullet): ``` * here is my first list item. * and my second. ``` But Markdown also allows a "lazy" format: ``` * here is my first list item. * and my second. ``` {#block-content-in-list-items} ### Block content in list items A list item may contain multiple paragraphs and other block-level content. However, subsequent paragraphs must be preceded by a blank line and indented to line up with the first non-space content after the list marker. ``` * First paragraph. Continued. * Second paragraph. With a code block, which must be indented eight spaces: { code } ``` Exception: if the list marker is followed by an indented code block, which must begin 5 spaces after the list marker, then subsequent paragraphs must begin two columns after the last character of the list marker: ``` * code continuation paragraph ``` List items may include other lists. In this case the preceding blank line is optional. The nested list must be indented to line up with the first non-space character after the list marker of the containing list item. ``` * fruits + apples - macintosh - red delicious + pears + peaches * vegetables + broccoli + chard ``` As noted above, Markdown allows you to write list items "lazily," instead of indenting continuation lines. 
However, if there are multiple paragraphs or other blocks in a list item, the first line of each must be indented. ``` + A lazy, lazy, list item. + Another one; this looks bad but is legal. Second paragraph of second list item. ``` {#ordered-lists} ### Ordered lists Ordered lists work just like bulleted lists, except that the items begin with enumerators rather than bullets. In original Markdown, enumerators are decimal numbers followed by a period and a space. The numbers themselves are ignored, so there is no difference between this list: ``` 1. one 2. two 3. three ``` and this one: ``` 5. one 7. two 1. three ``` {#extension-fancy_lists} #### Extension: `fancy_lists` Unlike original Markdown, pandoc allows ordered list items to be marked with uppercase and lowercase letters and roman numerals, in addition to Arabic numerals. List markers may be enclosed in parentheses or followed by a single right-parenthesis or period. They must be separated from the text that follows by at least one space, and, if the list marker is a capital letter with a period, by at least two spaces.[^1] The `fancy_lists` extension also allows '`#`' to be used as an ordered list marker in place of a numeral: ``` #. one #. two ``` Note: the '`#`' ordered list marker doesn’t work with `commonmark`. {#extension-startnum} #### Extension: `startnum` Pandoc also pays attention to the type of list marker used, and to the starting number, and both of these are preserved where possible in the output format. Thus, the following yields a list with numbers followed by a single parenthesis, starting with 9, and a sublist with lowercase roman numerals: ``` 9) Ninth 10) Tenth 11) Eleventh i. subone ii. subtwo iii. subthree ``` Pandoc will start a new list each time a different type of list marker is used. So, the following will create three lists: ``` (2) Two (5) Three 1. Four * Five ``` If default list markers are desired, use `#.`: ``` #. one #. two #. 
three ``` {#extension-task_lists} #### Extension: `task_lists` Pandoc supports task lists, using the syntax of GitHub-Flavored Markdown. ``` - [ ] an unchecked task list item - [x] checked item ``` {#definition-lists} ### Definition lists {#extension-definition_lists} #### Extension: `definition_lists` Pandoc supports definition lists, using the syntax of [PHP Markdown Extra](https://michelf.ca/projects/php-markdown/extra/) with some extensions.[^2] ``` Term 1 : Definition 1 Term 2 with *inline markup* : Definition 2 { some code, part of Definition 2 } Third paragraph of definition 2. ``` Each term must fit on one line, which may optionally be followed by a blank line, and must be followed by one or more definitions. A definition begins with a colon or tilde, which may be indented one or two spaces. A term may have multiple definitions, and each definition may consist of one or more block elements (paragraph, code block, list, etc.), each indented four spaces or one tab stop. The body of the definition (not including the first line) should be indented four spaces. However, as with other Markdown lists, you can "lazily" omit indentation except at the beginning of a paragraph or other block element: ``` Term 1 : Definition with lazy continuation. Second paragraph of the definition. ``` If you leave space before the definition (as in the example above), the text of the definition will be treated as a paragraph. In some output formats, this will mean greater spacing between term/definition pairs. For a more compact definition list, omit the space before the definition: ``` Term 1 ~ Definition 1 Term 2 ~ Definition 2a ~ Definition 2b ``` Note that space between items in a definition list is required. (A variant that loosens this requirement, but disallows "lazy" hard wrapping, can be activated with the [`compact_definition_lists` extension](#extension-compact_definition_lists).) 
{#numbered-example-lists} ### Numbered example lists {#extension-example_lists} #### Extension: `example_lists` The special list marker `@` can be used for sequentially numbered examples. The first list item with a `@` marker will be numbered '1', the next '2', and so on, throughout the document. The numbered examples need not occur in a single list; each new list using `@` will take up where the last stopped. So, for example: ``` (@) My first example will be numbered (1). (@) My second example will be numbered (2). Explanation of examples. (@) My third example will be numbered (3). ``` Numbered examples can be labeled and referred to elsewhere in the document: ``` (@good) This is a good example. As (@good) illustrates, ... ``` The label can be any string of alphanumeric characters, underscores, or hyphens. Note: continuation paragraphs in example lists must always be indented four spaces, regardless of the length of the list marker. That is, example lists always behave as if the `four_space_rule` extension is set. This is because example labels tend to be long, and indenting content to the first non-space character after the label would be awkward. {#ending-a-list} ### Ending a list What if you want to put an indented code block after a list? ``` - item one - item two { my code block } ``` Trouble! Here pandoc (like other Markdown implementations) will treat `{ my code block }` as the second paragraph of item two, and not as a code block. To "cut off" the list after item two, you can insert some non-indented content, like an HTML comment, which won’t produce visible output in any format: ``` - item one - item two <!-- end of list --> { my code block } ``` You can use the same trick if you want two consecutive lists instead of one big list: ``` 1. one 2. two 3. three <!-- --> 1. uno 2. dos 3. 
tres ``` {#horizontal-rules} ## Horizontal rules A line containing a row of three or more `*`, `-`, or `_` characters (optionally separated by spaces) produces a horizontal rule: ``` * * * * --------------- ``` We strongly recommend that horizontal rules be separated from surrounding text by blank lines. If a horizontal rule is not followed by a blank line, pandoc may try to interpret the lines that follow as a YAML metadata block or a table. {#tables} ## Tables Four kinds of tables may be used. The first three kinds presuppose the use of a fixed-width font, such as Courier. The fourth kind can be used with proportionally spaced fonts, as it does not require lining up columns. {#extension-table_captions} #### Extension: `table_captions` A caption may optionally be provided with all 4 kinds of tables (as illustrated in the examples below). A caption is a paragraph beginning with the string `Table:` (or `table:` or just `:`), which will be stripped off. It may appear either before or after the table. {#extension-simple_tables} #### Extension: `simple_tables` Simple tables look like this: ``` Right Left Center Default ------- ------ ---------- ------- 12 12 12 12 123 123 123 123 1 1 1 1 Table: Demonstration of simple table syntax. ``` The header and table rows must each fit on one line. Column alignments are determined by the position of the header text relative to the dashed line below it:[^3] - If the dashed line is flush with the header text on the right side but extends beyond it on the left, the column is right-aligned. - If the dashed line is flush with the header text on the left side but extends beyond it on the right, the column is left-aligned. - If the dashed line extends beyond the header text on both sides, the column is centered. - If the dashed line is flush with the header text on both sides, the default alignment is used (in most cases, this will be left). The table must end with a blank line, or a line of dashes followed by a blank line. 
The column header row may be omitted, provided a dashed line is used to end the table. For example: ``` ------- ------ ---------- ------- 12 12 12 12 123 123 123 123 1 1 1 1 ------- ------ ---------- ------- ``` When the header row is omitted, column alignments are determined on the basis of the first line of the table body. So, in the tables above, the columns would be right, left, center, and right aligned, respectively. {#extension-multiline_tables} #### Extension: `multiline_tables` Multiline tables allow header and table rows to span multiple lines of text (but cells that span multiple columns or rows of the table are not supported). Here is an example: ``` ------------------------------------------------------------- Centered Default Right Left Header Aligned Aligned Aligned ----------- ------- --------------- ------------------------- First row 12.0 Example of a row that spans multiple lines. Second row 5.0 Here's another one. Note the blank line between rows. ------------------------------------------------------------- Table: Here's the caption. It, too, may span multiple lines. ``` These work like simple tables, but with the following differences: - They must begin with a row of dashes, before the header text (unless the header row is omitted). - They must end with a row of dashes, then a blank line. - The rows must be separated by blank lines. In multiline tables, the table parser pays attention to the widths of the columns, and the writers try to reproduce these relative widths in the output. So, if you find that one of the columns is too narrow in the output, try widening it in the Markdown source. The header may be omitted in multiline tables as well as simple tables: ``` ----------- ------- --------------- ------------------------- First row 12.0 Example of a row that spans multiple lines. Second row 5.0 Here's another one. Note the blank line between rows. 
----------- ------- --------------- ------------------------- : Here's a multiline table without a header. ``` It is possible for a multiline table to have just one row, but the row should be followed by a blank line (and then the row of dashes that ends the table), or the table may be interpreted as a simple table. {#extension-grid_tables} #### Extension: `grid_tables` Grid tables look like this: ``` : Sample grid table. +---------------+---------------+--------------------+ | Fruit | Price | Advantages | +===============+===============+====================+ | Bananas | $1.34 | - built-in wrapper | | | | - bright color | +---------------+---------------+--------------------+ | Oranges | $2.10 | - cures scurvy | | | | - tasty | +---------------+---------------+--------------------+ ``` The row of `=`s separates the header from the table body, and can be omitted for a headerless table. The cells of grid tables may contain arbitrary block elements (multiple paragraphs, code blocks, lists, etc.). 
Cells can span multiple columns or rows: ``` +---------------------+----------+ | Property | Earth | +=============+=======+==========+ | | min | -89.2 °C | | Temperature +-------+----------+ | 1961-1990 | mean | 14 °C | | +-------+----------+ | | max | 56.7 °C | +-------------+-------+----------+ ``` A table header may contain more than one row: ``` +---------------------+-----------------------+ | Location | Temperature 1961-1990 | | | in degree Celsius | | +-------+-------+-------+ | | min | mean | max | +=====================+=======+=======+=======+ | Antarctica | -89.2 | N/A | 19.8 | +---------------------+-------+-------+-------+ | Earth | -89.2 | 14 | 56.7 | +---------------------+-------+-------+-------+ ``` Alignments can be specified as with pipe tables, by putting colons at the boundaries of the separator line after the header: ``` +---------------+---------------+--------------------+ | Right | Left | Centered | +==============:+:==============+:==================:+ | Bananas | $1.34 | built-in wrapper | +---------------+---------------+--------------------+ ``` For headerless tables, the colons go on the top line instead: ``` +--------------:+:--------------+:------------------:+ | Right | Left | Centered | +---------------+---------------+--------------------+ ``` A table foot can be defined by enclosing it with separator lines that use `=` instead of `-`: ``` +---------------+---------------+ | Fruit | Price | +===============+===============+ | Bananas | $1.34 | +---------------+---------------+ | Oranges | $2.10 | +===============+===============+ | Sum | $3.44 | +===============+===============+ ``` The foot must always be placed at the very bottom of the table. Grid tables can be created easily using Emacs’ table-mode (`M-x table-insert`). 
{#extension-pipe_tables} #### Extension: `pipe_tables` Pipe tables look like this: ``` | Right | Left | Default | Center | |------:|:-----|---------|:------:| | 12 | 12 | 12 | 12 | | 123 | 123 | 123 | 123 | | 1 | 1 | 1 | 1 | : Demonstration of pipe table syntax. ``` The syntax is identical to [PHP Markdown Extra tables](https://michelf.ca/projects/php-markdown/extra/#table). The beginning and ending pipe characters are optional, but pipes are required between all columns. The colons indicate column alignment as shown. The header cannot be omitted. To simulate a headerless table, include a header with blank cells. Since the pipes indicate column boundaries, columns need not be vertically aligned, as they are in the above example. So, this is a perfectly legal (though ugly) pipe table: ``` fruit| price -----|-----: apple|2.05 pear|1.37 orange|3.09 ``` The cells of pipe tables cannot contain block elements like paragraphs and lists, and cannot span multiple lines. If any line of the markdown source is longer than the column width (see `--columns`), then the table will take up the full text width and the cell contents will wrap, with the relative cell widths determined by the number of dashes in the line separating the table header from the table body. (For example `---|-` would make the first column 3/4 and the second column 1/4 of the full text width.) On the other hand, if no lines are wider than column width, then cell contents will not be wrapped, and the cells will be sized to their contents. Note: pandoc also recognizes pipe tables of the following form, as can be produced by Emacs’ orgtbl-mode: ``` | One | Two | |-----+-------| | my | table | | is | nice | ``` The difference is that `+` is used instead of `|`. Other orgtbl features are not supported. In particular, to get non-default column alignment, you’ll need to add colons as above. 
{#metadata-blocks} ## Metadata blocks {#extension-pandoc_title_block} #### Extension: `pandoc_title_block` If the file begins with a title block ``` % title % author(s) (separated by semicolons) % date ``` it will be parsed as bibliographic information, not regular text. (It will be used, for example, in the title of standalone LaTeX or HTML output.) The block may contain just a title, a title and an author, or all three elements. If you want to include an author but no title, or a title and a date but no author, you need a blank line: ``` % % Author ``` ``` % My title % % June 15, 2006 ``` The title may occupy multiple lines, but continuation lines must begin with leading space, thus: ``` % My title on multiple lines ``` If a document has multiple authors, the authors may be put on separate lines with leading space, or separated by semicolons, or both. So, all of the following are equivalent: ``` % Author One Author Two ``` ``` % Author One; Author Two ``` ``` % Author One; Author Two ``` The date must fit on one line. All three metadata fields may contain standard inline formatting (italics, links, footnotes, etc.). Title blocks will always be parsed, but they will affect the output only when the `--standalone` (`-s`) option is chosen. In HTML output, titles will appear twice: once in the document head – this is the title that will appear at the top of the window in a browser – and once at the beginning of the document body. The title in the document head can have an optional prefix attached (`--title-prefix` or `-T` option). The title in the body appears as an H1 element with class "title", so it can be suppressed or reformatted with CSS. If a title prefix is specified with `-T` and no title block appears in the document, the title prefix will be used by itself as the HTML title. The man page writer extracts a title, man page section number, and other header and footer information from the title line. 
The title is assumed to be the first word on the title line, which may optionally end with a (single-digit) section number in parentheses. (There should be no space between the title and the parentheses.) Anything after this is assumed to be additional footer and header text. A single pipe character (`|`) should be used to separate the footer text from the header text. Thus, ``` % PANDOC(1) ``` will yield a man page with the title `PANDOC` and section 1. ``` % PANDOC(1) Pandoc User Manuals ``` will also have "Pandoc User Manuals" in the footer. ``` % PANDOC(1) Pandoc User Manuals | Version 4.0 ``` will also have "Version 4.0" in the header. {#extension-yaml_metadata_block} #### Extension: `yaml_metadata_block` A [YAML](https://yaml.org/spec/1.2/spec.html){title="YAML v1.2 Spec"} metadata block is a valid YAML object, delimited by a line of three hyphens (`---`) at the top and a line of three hyphens (`---`) or three dots (`...`) at the bottom. The initial line `---` must not be followed by a blank line. A YAML metadata block may occur anywhere in the document, but if it is not at the beginning, it must be preceded by a blank line. Note that, because of the way pandoc concatenates input files when several are provided, you may also keep the metadata in a separate YAML file and pass it to pandoc as an argument, along with your Markdown files: ``` pandoc chap1.md chap2.md chap3.md metadata.yaml -s -o book.html ``` Just be sure that the YAML file begins with `---` and ends with `---` or `...`. Alternatively, you can use the `--metadata-file` option. Using that approach however, you cannot reference content (like footnotes) from the main markdown input document. Metadata will be taken from the fields of the YAML object and added to any existing document metadata. Metadata can contain lists and objects (nested arbitrarily), but all string scalars will be interpreted as Markdown. Fields with names ending in an underscore will be ignored by pandoc. 
(They may be given a role by external processors.) Field names must not be interpretable as YAML numbers or boolean values (so, for example, `yes`, `True`, and `15` cannot be used as field names). A document may contain multiple metadata blocks. If two metadata blocks attempt to set the same field, the value from the second block will be taken. Each metadata block is handled internally as an independent YAML document. This means, for example, that any YAML anchors defined in a block cannot be referenced in another block. When pandoc is used with `-t markdown` to create a Markdown document, a YAML metadata block will be produced only if the `-s/--standalone` option is used. All of the metadata will appear in a single block at the beginning of the document. Note that [YAML](https://yaml.org/spec/1.2/spec.html){title="YAML v1.2 Spec"} escaping rules must be followed. Thus, for example, if a title contains a colon, it must be quoted, and if it contains a backslash escape, then it must be ensured that it is not treated as a [YAML escape sequence](https://yaml.org/spec/1.2/spec.html#id2776092). The pipe character (`|`) can be used to begin an indented block that will be interpreted literally, without need for escaping. This form is necessary when the field contains blank lines or block-level formatting: ``` --- title: 'This is the title: it contains a colon' author: - Author One - Author Two keywords: [nothing, nothingness] abstract: | This is the abstract. It consists of two paragraphs. ... ``` The literal block after the `|` must be indented relative to the line containing the `|`. If it is not, the YAML will be invalid and pandoc will not interpret it as metadata. For an overview of the complex rules governing YAML, see the [Wikipedia entry on YAML syntax](https://en.wikipedia.org/wiki/YAML#Syntax). Template variables will be set automatically from the metadata. 
Thus, for example, in writing HTML, the variable `abstract` will be set to the HTML equivalent of the Markdown in the `abstract` field: ```
<p>This is the abstract.</p>
<p>It consists of two paragraphs.</p>
``` Variables can contain arbitrary YAML structures, but the template must match this structure. The `author` variable in the default templates expects a simple list or string, but can be changed to support more complicated structures. The following combination, for example, would add an affiliation to the author if one is given: ``` --- title: The document title author: - name: Author One affiliation: University of Somewhere - name: Author Two affiliation: University of Nowhere ... ``` To use the structured authors in the example above, you would need a custom template: ``` $for(author)$ $if(author.name)$ $author.name$$if(author.affiliation)$ ($author.affiliation$)$endif$ $else$ $author$ $endif$ $endfor$ ``` Raw content to include in the document’s header may be specified using `header-includes`; however, it is important to mark up this content as raw code for a particular output format, using the [`raw_attribute` extension](#extension-raw_attribute), or it will be interpreted as markdown. For example: ```` header-includes: - | ```{=latex} \let\oldsection\section \renewcommand{\section}[1]{\clearpage\oldsection{#1}} ``` ```` Note: the `yaml_metadata_block` extension works with `commonmark` as well as `markdown` (and it is enabled by default in `gfm` and `commonmark_x`). However, in these formats the following restrictions apply: - The YAML metadata block must occur at the beginning of the document (and there can be only one). If multiple files are given as arguments to pandoc, only the first can be a YAML metadata block. - The leaf nodes of the YAML structure are parsed in isolation from each other and from the rest of the document. So, for example, you can’t use a reference link in these contexts if the link definition is somewhere else in the document. 
{#backslash-escapes} ## Backslash escapes {#extension-all_symbols_escapable} #### Extension: `all_symbols_escapable` Except inside a code block or inline code, any punctuation or space character preceded by a backslash will be treated literally, even if it would normally indicate formatting. Thus, for example, if one writes ``` *\*hello\** ``` one will get ``` *hello* ``` instead of ``` hello ``` This rule is easier to remember than original Markdown’s rule, which allows only the following characters to be backslash-escaped: ``` \`*_{}[]()>#+-.! ``` (However, if the `markdown_strict` format is used, the original Markdown rule will be used.) A backslash-escaped space is parsed as a nonbreaking space. In TeX output, it will appear as `~`. In HTML and XML output, it will appear as a literal unicode nonbreaking space character (note that it will thus actually look "invisible" in the generated HTML source; you can still use the `--ascii` command-line option to make it appear as an explicit entity). A backslash-escaped newline (i.e. a backslash occurring at the end of a line) is parsed as a hard line break. It will appear in TeX output as `\\` and in HTML as `
<br />`. This is a nice alternative to Markdown’s "invisible" way of indicating hard line breaks using two trailing spaces on a line. Backslash escapes do not work in verbatim contexts. {#inline-formatting} ## Inline formatting {#emphasis} ### Emphasis To _emphasize_ some text, surround it with `*`s or `_`, like this: ``` This text is _emphasized with underscores_, and this is *emphasized with asterisks*. ``` Double `*` or `_` produces *strong emphasis*: ``` This is **strong emphasis** and __with underscores__. ``` A `*` or `_` character surrounded by spaces, or backslash-escaped, will not trigger emphasis: ``` This is * not emphasized *, and \*neither is this\*. ``` {#extension-intraword_underscores} #### Extension: `intraword_underscores` Because `_` is sometimes used inside words and identifiers, pandoc does not interpret a `_` surrounded by alphanumeric characters as an emphasis marker. If you want to emphasize just part of a word, use `*`: ``` feas*ible*, not feas*able*. ``` {#strikeout} ### Strikeout {#extension-strikeout} #### Extension: `strikeout` To strike out a section of text with a horizontal line, begin and end it with `~~`. Thus, for example, ``` This ~~is deleted text.~~ ``` {#superscripts-and-subscripts} ### Superscripts and subscripts {#extension-superscript-subscript} #### Extension: `superscript`, `subscript` Superscripts may be written by surrounding the superscripted text by `^` characters; subscripts may be written by surrounding the subscripted text by `~` characters. Thus, for example, ``` H~2~O is a liquid. 2^10^ is 1024. ``` The text between `^...^` or `~...~` may not contain spaces or newlines. If the superscripted or subscripted text contains spaces, these spaces must be escaped with backslashes. (This is to prevent accidental superscripting and subscripting through the ordinary use of `~` and `^`, and also bad interactions with footnotes.) Thus, if you want the letter P with 'a cat' in subscripts, use `P~a\ cat~`, not `P~a cat~`. 
{#verbatim} ### Verbatim To make a short span of text verbatim, put it inside backticks: ``` What is the difference between `>>=` and `>>`? ``` If the verbatim text includes a backtick, use double backticks: ``` Here is a literal backtick `` ` ``. ``` (The spaces after the opening backticks and before the closing backticks will be ignored.) The general rule is that a verbatim span starts with a string of consecutive backticks (optionally followed by a space) and ends with a string of the same number of backticks (optionally preceded by a space). Note that backslash-escapes (and other Markdown constructs) do not work in verbatim contexts: ``` This is a backslash followed by an asterisk: `\*`. ``` {#extension-inline_code_attributes} #### Extension: `inline_code_attributes` Attributes can be attached to verbatim text, just as with [fenced code blocks](#fenced-code-blocks): ``` `<$>`{.haskell} ``` {#underline} ### Underline To underline text, use the `underline` class: ``` [Underline]{.underline} ``` Or, without the `bracketed_spans` extension (but with `native_spans`): ``` Underline ``` This will work in all output formats that support underline. {#small-caps} ### Small caps To write small caps, use the `smallcaps` class: ``` [Small caps]{.smallcaps} ``` Or, without the `bracketed_spans` extension: ``` Small caps ``` For compatibility with other Markdown flavors, CSS is also supported: ``` Small caps ``` This will work in all output formats that support small caps. {#highlighting} ### Highlighting To highlight text, use the `mark` class: ``` [Mark]{.mark} ``` Or, without the `bracketed_spans` extension (but with `native_spans`): ``` Mark ``` This will work in all output formats that support highlighting. {#math} ## Math {#extension-tex_math_dollars} #### Extension: `tex_math_dollars` Anything between two `$` characters will be treated as TeX math. 
The opening `$` must have a non-space character immediately to its right, while the closing `$` must have a non-space character immediately to its left, and must not be followed immediately by a digit. Thus, `$20,000 and $30,000` won’t parse as math. If for some reason you need to enclose text in literal `$` characters, backslash-escape them and they won’t be treated as math delimiters. For display math, use `$$` delimiters. (In this case, the delimiters may be separated from the formula by whitespace. However, there can be no blank lines between the opening and closing `$$` delimiters.) TeX math will be printed in all output formats. How it is rendered depends on the output format: : LaTeX It will appear verbatim surrounded by `\(...\)` (for inline math) or `\[...\]` (for display math). : Markdown, Emacs Org mode, ConTeXt, ZimWiki It will appear verbatim surrounded by `$...$` (for inline math) or `$$...$$` (for display math). : XWiki It will appear verbatim surrounded by `{{formula}}..{{/formula}}`. : reStructuredText It will be rendered using an [interpreted text role `:math:`](https://docutils.sourceforge.io/docs/ref/rst/roles.html#math). : AsciiDoc For AsciiDoc output math will appear verbatim surrounded by `latexmath:[...]`. For `asciidoc_legacy` the bracketed material will also include inline or display math delimiters. : Texinfo It will be rendered inside a `@math` command. : roff man, Jira markup It will be rendered verbatim without `$`’s. : MediaWiki, DokuWiki It will be rendered inside `` tags. : Textile It will be rendered inside `` tags. : RTF, OpenDocument It will be rendered, if possible, using Unicode characters, and will otherwise appear verbatim. : ODT It will be rendered, if possible, using MathML. : DocBook If the `--mathml` flag is used, it will be rendered using MathML in an `inlineequation` or `informalequation` tag. Otherwise it will be rendered, if possible, using Unicode characters. 
: Docx and PowerPoint It will be rendered using OMML math markup. : FictionBook2 If the `--webtex` option is used, formulas are rendered as images using CodeCogs or other compatible web service, downloaded and embedded in the e-book. Otherwise, they will appear verbatim. : HTML, Slidy, DZSlides, S5, EPUB The way math is rendered in HTML will depend on the command-line options selected. Therefore see [Math rendering in HTML](#math-rendering-in-html) above. {#raw-html} ## Raw HTML {#extension-raw_html} #### Extension: `raw_html` Markdown allows you to insert raw HTML (or DocBook) anywhere in a document (except verbatim contexts, where `<`, `>`, and `&` are interpreted literally). (Technically this is not an extension, since standard Markdown allows it, but it has been made an extension so that it can be disabled if desired.) The raw HTML is passed through unchanged in HTML, S5, Slidy, Slideous, DZSlides, EPUB, Markdown, CommonMark, Emacs Org mode, and Textile output, and suppressed in other formats. For a more explicit way of including raw HTML in a Markdown document, see the [`raw_attribute` extension](#extension-raw_attribute). In the CommonMark format, if `raw_html` is enabled, superscripts, subscripts, strikeouts and small capitals will be represented as HTML. Otherwise, plain-text fallbacks will be used. Note that even if `raw_html` is disabled, tables will be rendered with HTML syntax if they cannot use pipe syntax. {#extension-markdown_in_html_blocks} #### Extension: `markdown_in_html_blocks` Original Markdown allows you to include HTML "blocks": blocks of HTML between balanced tags that are separated from the surrounding text with blank lines, and start and end at the left margin. Within these blocks, everything is interpreted as HTML, not Markdown; so (for example), `*` does not signify emphasis. Pandoc behaves this way when the `markdown_strict` format is used; but by default, pandoc interprets material between HTML block tags as Markdown. 
Thus, for example, pandoc will turn ```
*one* [a link](https://google.com)
``` into ```
one a link
``` whereas `Markdown.pl` will preserve it as is. There is one exception to this rule: text between ` HTML """) ``` ## Image This image ![image](myimage.png) will be included as a cell attachment. ```` If you want to add cell attributes, group cells differently, or add output to code cells, then you need to include divs to indicate the structure. You can use either [fenced divs](#extension-fenced_divs) or [native divs](#extension-native_divs) for this. Here is an example: ```` :::::: {.cell .markdown} # Lorem **Lorem ipsum** dolor sit amet, consectetur adipiscing elit. Nunc luctus bibendum felis dictum sodales. :::::: :::::: {.cell .code execution_count=1} ``` {.python} print("hello") ``` ::: {.output .stream .stdout} ``` hello ``` ::: :::::: :::::: {.cell .code execution_count=2} ``` {.python} from IPython.display import HTML HTML(""" HTML """) ``` ::: {.output .execute_result execution_count=2} ```{=html} HTML hello ``` ::: :::::: ```` If you include raw HTML or TeX in an output cell, use the [raw attribute](#extension-raw_attribute), as shown in the last cell of the example above. Although pandoc can process "bare" raw HTML and TeX, the result is often interspersed raw elements and normal textual elements, and in an output cell pandoc expects a single, connected raw block. To avoid using raw HTML or TeX except when marked explicitly using raw attributes, we recommend specifying the extensions `-raw_html-raw_tex+raw_attribute` when translating between Markdown and ipynb notebooks. Note that options and extensions that affect reading and writing of Markdown will also affect Markdown cells in ipynb notebooks. For example, `--wrap=preserve` will preserve soft line breaks in Markdown cells; `--markdown-headings=setext` will cause Setext-style headings to be used; and `--preserve-tabs` will prevent tabs from being turned to spaces. 
{#syntax-highlighting} # Syntax highlighting Pandoc will automatically highlight syntax in [fenced code blocks](#fenced-code-blocks) that are marked with a language name. The Haskell library [skylighting](https://github.com/jgm/skylighting) is used for highlighting. Currently highlighting is supported only for HTML, EPUB, Docx, Ms, and LaTeX/PDF output. To see a list of language names that pandoc will recognize, type `pandoc --list-highlight-languages`. The color scheme can be selected using the `--highlight-style` option. The default color scheme is `pygments`, which imitates the default color scheme used by the Python library pygments (though pygments is not actually used to do the highlighting). To see a list of highlight styles, type `pandoc --list-highlight-styles`. If you are not satisfied with the predefined styles, you can use `--print-highlight-style` to generate a JSON `.theme` file which can be modified and used as the argument to `--highlight-style`. To get a JSON version of the `pygments` style, for example: ``` pandoc --print-highlight-style pygments > my.theme ``` Then edit `my.theme` and use it like this: ``` pandoc --highlight-style my.theme ``` If you are not satisfied with the built-in highlighting, or you want to highlight a language that isn’t supported, you can use the `--syntax-definition` option to load a [KDE-style XML syntax definition file](https://docs.kde.org/stable5/en/kate/katepart/highlight.html). Before writing your own, have a look at KDE’s [repository of syntax definitions](https://github.com/KDE/syntax-highlighting/tree/master/data/syntax). To disable highlighting, use the `--no-highlight` option. {#custom-styles} # Custom Styles Custom styles can be used in the docx and ICML formats. {#output} ## Output By default, pandoc’s docx and ICML output applies a predefined set of styles for blocks such as paragraphs and block quotes, and uses largely default formatting (italics, bold) for inlines. 
This will work for most purposes, especially alongside a `reference.docx` file. However, if you need to apply your own styles to blocks, or match a preexisting set of styles, pandoc allows you to define custom styles for blocks and text using `div`s and `span`s, respectively. If you define a `div` or `span` with the attribute `custom-style`, pandoc will apply your specified style to the contained elements (with the exception of elements whose function depends on a style, like headings, code blocks, block quotes, or links). So, for example, using the `bracketed_spans` syntax, ``` [Get out]{custom-style="Emphatically"}, he said. ``` would produce a docx file with "Get out" styled with character style `Emphatically`. Similarly, using the `fenced_divs` syntax, ``` Dickinson starts the poem simply: ::: {custom-style="Poetry"} | A Bird came down the Walk--- | He did not know I saw--- ::: ``` would style the two contained lines with the `Poetry` paragraph style. For docx output, styles will be defined in the output file as inheriting from normal text, if the styles are not yet in your reference.docx. If they are already defined, pandoc will not alter the definition. This feature allows for greatest customization in conjunction with [pandoc filters](https://pandoc.org/filters.html). If you want all paragraphs after block quotes to be indented, you can write a filter to apply the styles necessary. If you want all italics to be transformed to the `Emphasis` character style (perhaps to change their color), you can write a filter which will transform all italicized inlines to inlines within an `Emphasis` custom-style `span`. For docx output, you don’t need to enable any extensions for custom styles to work. {#input} ## Input The docx reader, by default, only reads those styles that it can convert into pandoc elements, either by direct conversion or interpreting the derivation of the input document’s styles. 
By enabling the [`styles` extension](#ext-styles) in the docx reader (`-f docx+styles`), you can produce output that maintains the styles of the input document, using the `custom-style` class. Paragraph styles are interpreted as divs, while character styles are interpreted as spans. For example, using the `custom-style-reference.docx` file in the test directory, we have the following different outputs: Without the `+styles` extension: ``` $ pandoc test/docx/custom-style-reference.docx -f docx -t markdown This is some text. This is text with an *emphasized* text style. And this is text with a **strengthened** text style. > Here is a styled paragraph that inherits from Block Text. ``` And with the extension: ``` $ pandoc test/docx/custom-style-reference.docx -f docx+styles -t markdown ::: {custom-style="First Paragraph"} This is some text. ::: ::: {custom-style="Body Text"} This is text with an [emphasized]{custom-style="Emphatic"} text style. And this is text with a [strengthened]{custom-style="Strengthened"} text style. ::: ::: {custom-style="My Block Style"} > Here is a styled paragraph that inherits from Block Text. ::: ``` With these custom styles, you can use your input document as a reference-doc while creating docx output (see below), and maintain the same styles in your input and output files. {#custom-readers-and-writers} # Custom readers and writers Pandoc can be extended with custom readers and writers written in [Lua](https://www.lua.org). (Pandoc includes a Lua interpreter, so Lua need not be installed separately.) To use a custom reader or writer, simply specify the path to the Lua script in place of the input or output format. For example: ``` pandoc -t data/sample.lua pandoc -f my_custom_markup_language.lua -t latex -s ``` If the script is not found relative to the working directory, it will be sought in the `custom` subdirectory of the user data directory (see `--data-dir`). 
A custom reader is a Lua script that defines one function, Reader, which takes a string as input and returns a Pandoc AST. See the [Lua filters documentation](https://pandoc.org/lua-filters.html) for documentation of the functions that are available for creating pandoc AST elements. For parsing, the [lpeg](http://www.inf.puc-rio.br/~roberto/lpeg/) parsing library is available by default. To see a sample custom reader: ``` pandoc --print-default-data-file creole.lua ``` If you want your custom reader to have access to reader options (e.g. the tab stop setting), you give your Reader function a second `options` parameter. A custom writer is a Lua script that defines a function that specifies how to render each element in a Pandoc AST. See the [djot-writer.lua](https://github.com/jgm/djot.lua/blob/main/djot-writer.lua) for a full-featured example. Note that custom writers have no default template. If you want to use `--standalone` with a custom writer, you will need to specify a template manually using `--template` or add a new default template with the name `default.NAME_OF_CUSTOM_WRITER.lua` to the `templates` subdirectory of your user data directory (see [Templates](#templates)). {#reproducible-builds} # Reproducible builds Some of the document formats pandoc targets (such as EPUB, docx, and ODT) include build timestamps in the generated document. That means that the files generated on successive builds will differ, even if the source does not. To avoid this, set the `SOURCE_DATE_EPOCH` environment variable, and the timestamp will be taken from it instead of the current time. `SOURCE_DATE_EPOCH` should contain an integer unix timestamp (specifying the number of seconds since midnight UTC January 1, 1970). Some document formats also include a unique identifier. For EPUB, this can be set explicitly by setting the `identifier` metadata field (see [EPUB Metadata](#epub-metadata), above). 
{#accessible-pdfs-and-pdf-archiving-standards} # Accessible PDFs and PDF archiving standards PDF is a flexible format, and using PDF in certain contexts requires additional conventions. For example, PDFs are not accessible by default; they define how characters are placed on a page but do not contain semantic information on the content. However, it is possible to generate accessible PDFs, which use tagging to add semantic information to the document. Pandoc defaults to LaTeX to generate PDF. Tagging support in LaTeX is in development and not readily available, so PDFs generated in this way will always be untagged and not accessible. This means that alternative engines must be used to generate accessible PDFs. The PDF standards PDF/A and PDF/UA define further restrictions intended to optimize PDFs for archiving and accessibility. Tagging is commonly used in combination with these standards to ensure best results. Note, however, that standard compliance depends on many things, including the colorspace of embedded images. Pandoc cannot check this, and external programs must be used to ensure that generated PDFs are in compliance. {#context} ## ConTeXt ConTeXt always produces tagged PDFs, but the quality depends on the input. The default ConTeXt markup generated by pandoc is optimized for readability and reuse, not tagging. Enable the [`tagging`](#extension--tagging) format extension to force markup that is optimized for tagging. This can be combined with the `pdfa` variable to generate standard-compliant PDFs. E.g.: ``` pandoc --to=context+tagging -V pdfa=3a ``` A recent `context` version should be used, as older versions contained a bug that lead to invalid PDF metadata. {#weasyprint} ## WeasyPrint The HTML-based engine WeasyPrint includes experimental support for PDF/A and PDF/UA since version 57. Tagged PDFs can created with ``` pandoc --pdf-engine=weasyprint \ --pdf-engine-opt=--pdf-variant=pdf/ua-1 ... 
``` The feature is experimental and standard compliance should not be assumed. {#prince-xml} ## Prince XML The non-free HTML-to-PDf converter `prince` has extensive support for various PDF standards as well as tagging. E.g.: ``` pandoc --pdf-engine=prince \ --pdf-engine-opt=--tagged-pdf ... ``` See the prince documentation for more info. {#word-processors} ## Word Processors Word processors like LibreOffice and MS Word can also be used to generate standardized and tagged PDF output. Pandoc does not support direct conversions via these tools. However, pandoc can convert a document to a `docx` or `odt` file, which can then be opened and converted to PDF with the respective word processor. See the documentation for [Word](https://support.microsoft.com/en-us/office/create-accessible-pdfs-064625e0-56ea-4e16-ad71-3aa33bb4b7ed) and [LibreOffice](https://help.libreoffice.org/7.1/en-US/text/shared/01/ref_pdf_export_general.html). {#running-pandoc-as-a-web-server} # Running pandoc as a web server If you rename (or symlink) the pandoc executable to `pandoc-server`, or if you call pandoc with `server` as the first argument, it will start up a web server with a JSON API. This server exposes most of the conversion functionality of pandoc. For full documentation, see the [pandoc-server](https://github.com/jgm/pandoc/blob/master/doc/pandoc-server.md) man page. If you rename (or symlink) the pandoc executable to `pandoc-server.cgi`, it will function as a CGI program exposing the same API as `pandoc-server`. `pandoc-server` is designed to be maximally secure; it uses Haskell’s type system to provide strong guarantees that no I/O will be performed on the server during pandoc conversions. {#running-pandoc-as-a-lua-interpreter} # Running pandoc as a Lua interpreter Calling the pandoc executable under the name `pandoc-lua` or with `lua` as the first argument will make it function as a standalone Lua interpreter. 
The behavior is mostly identical to that of the [standalone `lua` executable](https://www.lua.org/manual/5.4/manual.html#7), version 5.4. However, there is no REPL yet, and the `-i` option has no effect. For full documentation, see the [pandoc-lua](https://github.com/jgm/pandoc/blob/master/doc/pandoc-lua.md) man page. {#a-note-on-security} # A note on security 1. Although pandoc itself will not create or modify any files other than those you explicitly ask it create (with the exception of temporary files used in producing PDFs), a filter or custom writer could in principle do anything on your file system. Please audit filters and custom writers very carefully before using them. 2. Several input formats (including HTML, Org, and RST) support `include` directives that allow the contents of a file to be included in the output. An untrusted attacker could use these to view the contents of files on the file system. (Using the `--sandbox` option can protect against this threat.) 3. Several output formats (including RTF, FB2, HTML with `--self-contained`, EPUB, Docx, and ODT) will embed encoded or raw images into the output file. An untrusted attacker could exploit this to view the contents of non-image files on the file system. (Using the `--sandbox` option can protect against this threat, but will also prevent including images in these formats.) 4. If your application uses pandoc as a Haskell library (rather than shelling out to the executable), it is possible to use it in a mode that fully isolates pandoc from your file system, by running the pandoc operations in the `PandocPure` monad. See the document [Using the pandoc API](https://pandoc.org/using-the-pandoc-api.html) for more details. (This corresponds to the use of the `--sandbox` option on the command line.) 5. Pandoc’s parsers can exhibit pathological performance on some corner cases. It is wise to put any pandoc operations under a timeout, to avoid DOS attacks that exploit these issues. 
If you are using the pandoc executable, you can add the command line options `+RTS -M512M -RTS` (for example) to limit the heap size to 512MB. Note that the `commonmark` parser (including `commonmark_x` and `gfm`) is much less vulnerable to pathological performance than the `markdown` parser, so it is a better choice when processing untrusted input. 6. The HTML generated by pandoc is not guaranteed to be safe. If `raw_html` is enabled for the Markdown input, users can inject arbitrary HTML. Even if `raw_html` is disabled, users can include dangerous content in URLs and attributes. To be safe, you should run all HTML generated from untrusted user input through an HTML sanitizer. {#authors} # Authors Copyright 2006–2022 John MacFarlane (jgm@berkeley.edu). Released under the [GPL](https://www.gnu.org/copyleft/gpl.html){title="GNU General Public License"}, version 2 or greater. This software carries no warranty of any kind. (See COPYRIGHT for full copyright and warranty notices.) For a full list of contributors, see the file AUTHORS.md in the pandoc source code. [^1]: The point of this rule is to ensure that normal paragraphs starting with people’s initials, like ``` B. Russell won a Nobel Prize (but not for "On Denoting"). ``` do not get treated as list items. This rule will not prevent ``` (C) 2007 Joe Smith ``` from being interpreted as a list item. In this case, a backslash escape can be used: ``` (C\) 2007 Joe Smith ``` [^2]: I have been influenced by the suggestions of [David Wheeler](https://justatheory.com/2009/02/modest-markdown-proposal/). [^3]: This scheme is due to Michel Fortin, who proposed it on the [Markdown discussion list](http://six.pairlist.net/pipermail/markdown-discuss/2005-March/001097.html). 
[^4]: To see why laziness is incompatible with relaxing the requirement of a blank line between items, consider the following example: ``` bar : definition foo : definition ``` Is this a single list item with two definitions of "bar," the first of which is lazily wrapped, or two list items? To remove the ambiguity we must either disallow lazy wrapping or require a blank line between list items. djot-0.1.2.4/djot.cabal0000644000000000000000000000514607346545000012737 0ustar0000000000000000cabal-version: 3.0 name: djot version: 0.1.2.4 synopsis: Parser and renderer for djot light markup syntax. description: Djot () is a light markup language. This package provides a data structure to represent djot documents, a very fast parser, and functions to render a parsed document as HTML and as djot. license: MIT license-file: LICENSE author: John MacFarlane maintainer: jgm@berkeley.edu copyright: Copyright (C) 2024 John MacFarlane category: Text build-type: Simple extra-doc-files: CHANGELOG.md extra-source-files: test/*.test benchmark/m.dj Source-repository head type: git location: https://github.com/jgm/djoths.git common deps build-depends: base >= 4.12 && < 5, bytestring >= 0.11.3, doclayout Library import: deps build-depends: mtl, containers >= 0.6.6, text, template-haskell hs-source-dirs: src default-language: Haskell2010 exposed-modules: Djot Djot.AST Djot.Parse Djot.Options Djot.Attributes Djot.Inlines Djot.Blocks Djot.Html Djot.Djot ghc-options: -Wall -O2 executable djoths import: deps main-is: Main.hs build-depends: djot, text hs-source-dirs: app default-language: Haskell2010 ghc-options: -Wall -O2 -rtsopts -threaded test-suite test-djot import: deps type: exitcode-stdio-1.0 main-is: Main.hs hs-source-dirs: test ghc-options: -Wall -threaded -rtsopts -with-rtsopts=-K40K -with-rtsopts=-kc40K if impl(ghc >= 8.10) ghc-options: -Wunused-packages build-depends: djot, directory, filepath, tasty, tasty-hunit, tasty-quickcheck, text default-language: Haskell2010 benchmark 
benchmark-djot import: deps type: exitcode-stdio-1.0 main-is: Main.hs hs-source-dirs: benchmark build-depends: djot, directory, filepath, tasty-bench ghc-options: -O2 -threaded -rtsopts -with-rtsopts=-K10K -with-rtsopts=-kc10K if impl(ghc >= 8.10) ghc-options: -Wunused-packages default-language: Haskell2010 djot-0.1.2.4/src/0000755000000000000000000000000007346545000011574 5ustar0000000000000000djot-0.1.2.4/src/Djot.hs0000644000000000000000000000056107346545000013032 0ustar0000000000000000module Djot ( parseDoc , renderHtml , renderDjot , toIdentifier , ParseOptions(..) , SourcePosOption(..) , RenderOptions(..) , module Djot.AST ) where import Djot.Options (ParseOptions(..), RenderOptions(..), SourcePosOption(..)) import Djot.Blocks (parseDoc, toIdentifier) import Djot.Html (renderHtml) import Djot.Djot (renderDjot) import Djot.AST djot-0.1.2.4/src/Djot/0000755000000000000000000000000007346545000012474 5ustar0000000000000000djot-0.1.2.4/src/Djot/AST.hs0000644000000000000000000003073207346545000013464 0ustar0000000000000000{-# LANGUAGE Strict #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE DeriveTraversable #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveLift #-} module Djot.AST ( Inline(..), Many(..), Inlines, MathStyle(..), Format(..), Node(Node), Pos(..), addAttr, addPos, Block(..), Blocks, Doc(..), NoteMap(..), insertNote, lookupNote, ReferenceMap(..), insertReference, lookupReference, normalizeLabel, Attr(..), Target(..), TaskStatus(..), Align(..), Cell(..), CellType(..), Caption(..), ListSpacing(..), OrderedListAttributes(..), OrderedListDelim(..), OrderedListStyle(..), QuoteType(..), delete, displayMath, insert, emailLink, emph, footnoteReference, hardBreak, highlight, image, inlineMath, link, nonBreakingSpace, rawInline, softBreak, span_, str, strong, subscript, superscript, singleQuoted, doubleQuoted, symbol, verbatim, urlLink, para, 
section, heading, blockQuote, codeBlock, div, bulletList, orderedList, definitionList, taskList, thematicBreak, table, rawBlock, inlinesToByteString ) where import Prelude hiding (div) import Data.ByteString (ByteString) import Data.Sequence (Seq) import qualified Data.Sequence as Seq import qualified Data.Map.Strict as M import Data.Set (Set) import Data.Data (Data, Typeable) import qualified Data.ByteString.Char8 as B8 import GHC.Generics (Generic) import Language.Haskell.TH.Syntax (Lift (..)) -- import Debug.Trace newtype Attr = Attr [(ByteString, ByteString)] deriving (Show, Eq, Ord, Typeable, Data, Generic, Lift) instance Semigroup Attr where Attr as <> Attr bs = Attr $ foldr integrate bs as instance Monoid Attr where mappend = (<>) mempty = Attr mempty integrate :: (ByteString, ByteString) -> [(ByteString, ByteString)] -> [(ByteString, ByteString)] integrate (k,v) kvs = case lookup k kvs of Nothing -> (k,v) : kvs Just v' | k == "class" -> (k, v <> " " <> v') : filter (\(k',_) -> k' /= "class") kvs | otherwise -> kvs data Pos = NoPos | Pos Int Int Int Int -- start line, start col, end line, end col deriving (Show, Eq, Ord, Typeable, Data, Generic, Lift) instance Semigroup Pos where Pos sl1 sc1 _ _ <> Pos _ _ el2 ec2 = Pos sl1 sc1 el2 ec2 NoPos <> _ = NoPos _ <> NoPos = NoPos instance Monoid Pos where mappend = (<>) mempty = NoPos data Node a = Node Pos Attr a deriving (Show, Eq, Ord, Functor, Traversable, Foldable, Typeable, Data, Generic, Lift) {-# INLINE addAttr #-} addAttr :: Attr -> Node a -> Node a addAttr attr (Node pos attr' bs) = Node pos (attr' <> attr) bs {-# INLINE addPos #-} addPos :: Pos -> Node a -> Node a addPos pos (Node _ attr bs) = Node pos attr bs newtype Format = Format { unFormat :: ByteString } deriving (Show, Eq, Ord, Typeable, Data, Generic, Lift) data MathStyle = DisplayMath | InlineMath deriving (Show, Ord, Eq, Typeable, Data, Generic, Lift) data Target = Direct ByteString | Reference ByteString deriving (Show, Ord, Eq, Typeable, 
Data, Generic, Lift) data QuoteType = SingleQuotes | DoubleQuotes deriving (Show, Ord, Eq, Typeable, Data, Generic, Lift) data Inline = Str ByteString | Emph Inlines | Strong Inlines | Highlight Inlines | Insert Inlines | Delete Inlines | Superscript Inlines | Subscript Inlines | Verbatim ByteString | Symbol ByteString | Math MathStyle ByteString | Link Inlines Target | Image Inlines Target | Span Inlines | FootnoteReference ByteString | UrlLink ByteString | EmailLink ByteString | RawInline Format ByteString | NonBreakingSpace | Quoted QuoteType Inlines | SoftBreak | HardBreak deriving (Show, Ord, Eq, Typeable, Data, Generic, Lift) newtype Many a = Many { unMany :: Seq a } deriving (Show, Ord, Eq, Functor, Traversable, Foldable, Typeable, Data, Generic, Lift) type Inlines = Many (Node Inline) instance Semigroup Inlines where Many as <> Many bs = case (Seq.viewr as, Seq.viewl bs) of (as' Seq.:> Node pos1 attr (Str s), Node pos2 attr' (Str t) Seq.:< bs') | attr == mempty && attr' /= mempty , (sa, sb) <- B8.spanEnd (not . 
isSpaceOrTab) s , not (B8.null sb) -> if B8.null sa then Many (as' <> (Node (pos1 <> pos2) attr' (Str (s <> t)) Seq.<| bs')) else let sblen = B8.length (B8.filter (\c -> c < '\128' || c >= '\192') sb) (pos1', pos2') = case pos1 <> pos2 of NoPos -> (NoPos, NoPos) Pos sl sc el ec -> (Pos sl sc el (ec - sblen), Pos sl (sc + sblen + 1) el ec) in Many ((as' Seq.|> Node pos1' mempty (Str sa) Seq.|> Node pos2' attr (Str (sb <> t))) <> bs') | attr == attr' -> Many (as' <> (Node (pos1 <> pos2) attr (Str (s <> t)) Seq.<| bs')) (as' Seq.:> Node pos attr (Str s), Node _ _ HardBreak Seq.:< _) | B8.all isSpaceOrTab (B8.takeEnd 1 s) -> Many (as' <> (Node pos attr (Str (B8.dropWhileEnd isSpaceOrTab s)) Seq.<| bs)) _ -> Many (as <> bs) where isSpaceOrTab ' ' = True isSpaceOrTab '\t' = True isSpaceOrTab _ = False instance Monoid Inlines where mappend = (<>) mempty = Many mempty data ListSpacing = Tight | Loose deriving (Show, Ord, Eq, Typeable, Data, Generic, Lift) data OrderedListStyle = Decimal | LetterUpper | LetterLower | RomanUpper | RomanLower deriving (Show, Ord, Eq, Typeable, Data, Generic, Lift) data OrderedListDelim = RightPeriod | RightParen | LeftRightParen deriving (Show, Ord, Eq, Typeable, Data, Generic, Lift) data OrderedListAttributes = OrderedListAttributes { orderedListStyle :: OrderedListStyle , orderedListDelim :: OrderedListDelim , orderedListStart :: Int } deriving (Show, Ord, Eq, Typeable, Data, Generic, Lift) data TaskStatus = Complete | Incomplete deriving (Show, Ord, Eq, Typeable, Data, Generic, Lift) newtype Caption = Caption Blocks deriving (Show, Ord, Eq, Typeable, Data, Generic, Lift) data Align = AlignLeft | AlignRight | AlignCenter | AlignDefault deriving (Show, Ord, Eq, Typeable, Data, Generic, Lift) data CellType = HeadCell | BodyCell deriving (Show, Ord, Eq, Typeable, Data, Generic, Lift) data Cell = Cell CellType Align Inlines deriving (Show, Ord, Eq, Typeable, Data, Generic, Lift) data Block = Para Inlines | Section Blocks | Heading Int Inlines | 
BlockQuote Blocks | CodeBlock ByteString ByteString | Div Blocks | OrderedList OrderedListAttributes ListSpacing [Blocks] | BulletList ListSpacing [Blocks] | TaskList ListSpacing [(TaskStatus, Blocks)] | DefinitionList ListSpacing [(Inlines, Blocks)] | ThematicBreak | Table (Maybe Caption) [[Cell]] | RawBlock Format ByteString deriving (Show, Ord, Eq, Typeable, Data, Generic, Lift) type Blocks = Many (Node Block) instance Semigroup Blocks where Many as <> Many bs = Many (as <> bs) instance Monoid Blocks where mappend = (<>) mempty = Many mempty data Doc = Doc{ docBlocks :: Blocks , docFootnotes :: NoteMap , docReferences :: ReferenceMap , docAutoReferences :: ReferenceMap , docAutoIdentifiers :: Set ByteString } deriving (Show, Ord, Eq, Typeable, Data, Generic, Lift) instance Semigroup Doc where Doc bs ns rs ar ai <> Doc bs' ns' rs' ar' ai' = Doc (bs <> bs') (ns <> ns') (rs <> rs') (ar <> ar') (ai <> ai') instance Monoid Doc where mappend = (<>) mempty = Doc mempty mempty mempty mempty mempty -- | A map from labels to contents. newtype NoteMap = NoteMap { unNoteMap :: M.Map ByteString Blocks } deriving (Show, Ord, Eq, Semigroup, Monoid, Typeable, Data, Generic, Lift) insertNote :: ByteString -> Blocks -> NoteMap -> NoteMap insertNote label ref (NoteMap m) = NoteMap (M.insert (normalizeLabel label) ref m) lookupNote :: ByteString -> NoteMap -> Maybe Blocks lookupNote label (NoteMap m) = M.lookup (normalizeLabel label) m newtype ReferenceMap = ReferenceMap { unReferenceMap :: M.Map ByteString (ByteString, Attr) } deriving (Show, Ord, Eq, Semigroup, Monoid, Typeable, Data, Generic, Lift) normalizeLabel :: ByteString -> ByteString normalizeLabel = B8.unwords . 
B8.splitWith isWs where isWs c = c == ' ' || c == '\t' || c == '\r' || c == '\n' insertReference :: ByteString -> (ByteString, Attr) -> ReferenceMap -> ReferenceMap insertReference label ref (ReferenceMap rm) = ReferenceMap (M.insert (normalizeLabel label) ref rm) lookupReference :: ByteString -> ReferenceMap -> Maybe (ByteString, Attr) lookupReference label (ReferenceMap rm) = M.lookup (normalizeLabel label) rm {-# INLINE inline #-} inline :: Inline -> Inlines inline = Many . Seq.singleton . Node NoPos mempty str, verbatim, symbol :: ByteString -> Inlines str = inline . Str verbatim = inline . Verbatim symbol = inline . Symbol emph, strong, superscript, subscript :: Inlines -> Inlines emph = inline . Emph strong = inline . Strong superscript = inline . Superscript subscript = inline . Subscript highlight, insert, delete :: Inlines -> Inlines highlight = inline . Highlight insert = inline . Insert delete = inline . Delete link, image :: Inlines -> Target -> Inlines link ils url = inline $ Link ils url image ils url = inline $ Image ils url span_ :: Inlines -> Inlines span_ = inline . Span softBreak, hardBreak, nonBreakingSpace :: Inlines softBreak = inline SoftBreak hardBreak = inline HardBreak nonBreakingSpace = inline NonBreakingSpace inlineMath, displayMath :: ByteString -> Inlines inlineMath = inline . Math InlineMath displayMath = inline . Math DisplayMath singleQuoted, doubleQuoted :: Inlines -> Inlines singleQuoted = inline . Quoted SingleQuotes doubleQuoted = inline . Quoted DoubleQuotes footnoteReference :: ByteString -> Inlines footnoteReference = inline . FootnoteReference urlLink, emailLink :: ByteString -> Inlines urlLink = inline . UrlLink emailLink = inline . EmailLink rawInline :: Format -> ByteString -> Inlines rawInline f = inline . RawInline f -- block :: Block -> Blocks block = Many . Seq.singleton . Node NoPos mempty para :: Inlines -> Blocks para = block . Para section :: Blocks -> Blocks section = block . 
Section heading :: Int -> Inlines -> Blocks heading lev = block . Heading lev blockQuote :: Blocks -> Blocks blockQuote = block . BlockQuote codeBlock :: ByteString -> ByteString -> Blocks codeBlock lang bs = block $ CodeBlock lang bs bulletList :: ListSpacing -> [Blocks] -> Blocks bulletList tightness = block . BulletList tightness orderedList :: OrderedListAttributes -> ListSpacing -> [Blocks] -> Blocks orderedList attr tightness = block . OrderedList attr tightness definitionList :: ListSpacing -> [(Inlines, Blocks)] -> Blocks definitionList tightness = block . DefinitionList tightness taskList :: ListSpacing -> [(TaskStatus, Blocks)] -> Blocks taskList tightness = block . TaskList tightness div :: Blocks -> Blocks div = block . Div thematicBreak :: Blocks thematicBreak = block ThematicBreak table :: Maybe Caption -> [[Cell]] -> Blocks table mbCaption = block . Table mbCaption rawBlock :: Format -> ByteString -> Blocks rawBlock f = block . RawBlock f inlinesToByteString :: Inlines -> ByteString inlinesToByteString = foldMap go . 
unMany where go (Node _pos _attr x) = case x of Str bs -> bs Emph ils -> inlinesToByteString ils Strong ils -> inlinesToByteString ils Highlight ils -> inlinesToByteString ils Insert ils -> inlinesToByteString ils Delete ils -> inlinesToByteString ils Superscript ils -> inlinesToByteString ils Subscript ils -> inlinesToByteString ils Quoted SingleQuotes ils -> "\x2018" <> inlinesToByteString ils <> "\x2019" Quoted DoubleQuotes ils -> "\x201C" <> inlinesToByteString ils <> "\x201D" Verbatim bs -> bs Math DisplayMath bs -> "$$" <> bs <> "$$" Math InlineMath bs -> "$" <> bs <> "$" Symbol bs -> ":" <> bs <> ":" Link ils _url -> inlinesToByteString ils Image ils _url -> inlinesToByteString ils Span ils -> inlinesToByteString ils UrlLink url -> url EmailLink email -> email RawInline _ _ -> mempty FootnoteReference bs -> "[" <> bs <> "]" SoftBreak -> "\n" HardBreak -> "\n" NonBreakingSpace -> "\160" djot-0.1.2.4/src/Djot/Attributes.hs0000644000000000000000000001465507346545000015171 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Strict #-} {-# LANGUAGE DeriveDataTypeable #-} module Djot.Attributes ( pAttributes , parseAttributes , AttrParserState -- opaque , AttrParseResult(..) ) where import Data.Char (isAlphaNum, isPunctuation) import Djot.AST (Attr(..)) import Djot.Parse import Data.ByteString (ByteString) import qualified Data.ByteString.Char8 as B8 import Data.ByteString.Char8 ( (!?) ) import Data.Typeable (Typeable) import Data.Maybe (fromMaybe) -- import Debug.Trace -- attributes { id = "foo", class = "bar baz", -- key1 = "val1", key2 = "val2" } -- syntax: -- -- attributes <- '{' (ignorable attribute)* ignorable* '}' -- attribute <- identifier | class | keyval -- identifier <- '#' name -- class <- '.' 
name -- name <- (nonspace, nonpunctuation other than ':', '_', '-')+ -- keyval <- key '=' val -- key <- (ASCII_ALPHANUM | ':' | '_' | '-')+ -- val <- bareval | quotedval -- bareval <- (ASCII_ALPHANUM | ':' | '_' | '-')+ -- quotedval <- '"' ([^"] | '\"') '"' -- ignorable <- whitespace | comment -- comment <- '%' [^%}]* '%' pAttributes :: Parser s Attr pAttributes = lookahead (asciiChar '{') >> getSlice >>= go Nothing where getSlice = byteStringOf $ branch (skipSome (skipSatisfyByte (/= '}'))) (optional_ (asciiChar '}')) (asciiChar '}') go mbst bs = do case parseAttributes mbst bs of Done (attr, _off) -> pure attr Partial st -> getSlice >>= go (Just st) Failed _off -> failed data AttrParseResult = Done (Attr, Int) -- result and byte offset | Failed Int -- byte offset of failure | Partial AttrParserState -- entire bytestring consumed deriving (Show, Typeable) data AttrParserState = AttrParserState { aState :: AState , subject :: ByteString , offset :: Int , parts :: [AttrPart] } deriving (Show, Typeable) data AState = SCANNING | AFTER_KEY | SCANNING_VALUE | SCANNING_QUOTED_VALUE | SCANNING_ESCAPE | SCANNING_COMMENT | FAIL | DONE | START deriving (Eq, Ord, Show, Typeable) data AttrPart = AttrId ByteString | AttrClass ByteString | AttrKey ByteString | AttrValue ByteString deriving (Eq, Ord, Show, Typeable) -- | Resumable parser, returning parts in reverse order. parseAttributes :: Maybe AttrParserState -> ByteString -> AttrParseResult parseAttributes mbState bs = case go (fromMaybe AttrParserState{ aState = START , subject = bs , offset = 0 , parts = [] } mbState) of AttrParserState{ aState = DONE, parts = attparts, offset = off } -> Done (attrPartsToAttr attparts, off) AttrParserState{ aState = FAIL, offset = off } -> Failed off st -> Partial st where go :: AttrParserState -> AttrParserState go st@(AttrParserState _ subj off _) = -- trace (show st) $ case subj !? 
off of Nothing -> st Just nextc -> case aState st of SCANNING -> case nextc of '}' -> go st{ aState = DONE, offset = off + 1 } '%' -> go st{ aState = SCANNING_COMMENT, offset = off + 1 } '#' -> go $ takePart isNameChar AttrId SCANNING st{ offset = off + 1 } '.' -> go $ takePart isNameChar AttrClass SCANNING st{ offset = off + 1 } c | isKeyChar c -> go $ takePart isKeyChar AttrKey AFTER_KEY st c | isWs c -> go $ skipWhile isWs st _ -> st{ aState = FAIL } AFTER_KEY -> case nextc of '=' -> go st{ aState = SCANNING_VALUE, offset = off + 1 } _ -> st{ aState = FAIL } SCANNING_VALUE -> case nextc of '"' -> go st{ aState = SCANNING_QUOTED_VALUE, offset = off + 1 } c | isBareValChar c -> go $ takePart isBareValChar AttrValue SCANNING st _ -> st{ aState = FAIL } SCANNING_QUOTED_VALUE -> case nextc of '"' -> go st{ aState = SCANNING, offset = off + 1 } '\\' -> go st{ aState = SCANNING_ESCAPE, offset = off + 1 } c | isWs c -> let st' = skipWhile isWs st in go st'{ parts = AttrValue " " : parts st' } _ -> go $ takePart (\c -> not (isWs c || c == '"' || c == '\\')) AttrValue SCANNING_QUOTED_VALUE st SCANNING_ESCAPE -> go st{ aState = SCANNING_QUOTED_VALUE, offset = off + 1, parts = AttrValue (B8.singleton nextc) : parts st } SCANNING_COMMENT -> case nextc of '%' -> go st{ aState = SCANNING, offset = off + 1 } '}' -> st{ aState = DONE, offset = off + 1 } _ -> go $ skipWhile (\c -> not (c == '%' || c == '}')) st FAIL -> st DONE -> st START -> case nextc of '{' -> go st{ aState = SCANNING, offset = off + 1 } _ -> st{ aState = FAIL } takePart :: (Char -> Bool) -> (ByteString -> AttrPart) -> AState -> AttrParserState -> AttrParserState takePart charP partConstructor nextstate st = case subject st !? 
offset st of Just c | charP c -> let val = B8.takeWhile charP (B8.drop (offset st) (subject st)) in st{ aState = nextstate, offset = offset st + B8.length val, parts = partConstructor val : parts st } _ -> st{ aState = FAIL } skipWhile :: (Char -> Bool) -> AttrParserState -> AttrParserState skipWhile charP st = case B8.findIndex (not . charP) (B8.drop (offset st) (subject st)) of Nothing -> st{ offset = B8.length (subject st) } Just i -> st{ offset = offset st + i } attrPartsToAttr :: [AttrPart] -> Attr attrPartsToAttr = go where go [] = Attr [] go (AttrId bs : xs) = (<> Attr [("id",bs)]) $ go xs go (AttrClass bs : xs) = (<> Attr [("class",bs)]) $ go xs go zs = case break isAttrKey zs of (vs, AttrKey bs : xs) -> (<> Attr [(bs, mconcat (reverse $ map getAttrVal vs))]) $ go xs _ -> Attr [] -- should not happen isAttrKey (AttrKey _) = True isAttrKey _ = False getAttrVal (AttrValue bs) = bs getAttrVal _ = mempty isNameChar :: Char -> Bool isNameChar c = not (isWs c) && (not (isPunctuation c) || c == ':' || c == '_' || c == '-') isKeyChar :: Char -> Bool isKeyChar c = isAlphaNum c || c == ':' || c == '_' || c == '-' isBareValChar :: Char -> Bool isBareValChar c = isAlphaNum c || c == ':' || c == '_' || c == '-' djot-0.1.2.4/src/Djot/Blocks.hs0000644000000000000000000012014207346545000014245 0ustar0000000000000000{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Strict #-} {-# LANGUAGE DeriveDataTypeable #-} module Djot.Blocks ( parseDoc , toIdentifier ) where import Prelude hiding (div) import Text.Read (readMaybe) import Data.Maybe (fromMaybe) import Data.Char (ord, isAsciiLower, isAsciiUpper, isAscii, isAlphaNum, isDigit) import Data.Foldable as F import Djot.Parse import Djot.AST import Djot.Inlines (parseInlines, parseTableCells) import Djot.Options (ParseOptions(..), SourcePosOption(..)) import Djot.Attributes (parseAttributes, AttrParserState, AttrParseResult(..)) import Data.Sequence (Seq) import 
qualified Data.Sequence as Seq import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as B8 import Data.ByteString (ByteString) import Control.Monad (replicateM_, void, mzero, unless, when, guard, foldM) import Data.List.NonEmpty (NonEmpty(..)) import Data.List (intercalate) import qualified Data.List.NonEmpty as NonEmpty import Data.Set (Set) import qualified Data.Set as Set import Control.Applicative import Data.Typeable (Typeable) -- import Debug.Trace parseDoc :: ParseOptions -> ByteString -> Either String Doc parseDoc opts bs = do case parse pDoc PState{ psParseOptions = opts , psContainerStack = NonEmpty.fromList [emptyContainer{ containerSpec = docSpec }] , psReferenceMap = mempty , psAutoReferenceMap = mempty , psNoteMap = mempty , psAttributes = mempty , psAttrParserState = Nothing , psIds = mempty , psAutoIds = mempty , psLastColumnPrevLine = 0 , psLastLine = 1 } [Chunk{ chunkLine = 1, chunkColumn = 1, chunkBytes = bs }] of Just doc -> Right doc Nothing -> Left "Parse failure." data BlockType = Normal | ListItem | CaptionBlock | Document deriving (Show, Eq) data BlockSpec = BlockSpec { -- | Descriptive name blockName :: String , -- | Type of block blockType :: BlockType -- | Parser for start of this block type , blockStart :: P () -- | Parser that must return True if this block is to continue , blockContinue :: Container -> P Bool -- | Just blockType if it can contain that type of block , blockContainsBlock :: Maybe BlockType -- | True if it can accept text lines , blockContainsLines :: Bool -- | Parser that runs when block is closed, possibly -- updating the container. , blockClose :: Container -> P Container -- | Parser that runs when the document is closed, creating the -- block AST element. 
, blockFinalize :: Container -> Blocks } docSpec :: BlockSpec docSpec = BlockSpec { blockName = "Doc" , blockType = Document , blockStart = mzero , blockContinue = \_ -> pure True , blockContainsBlock = Just Normal , blockContainsLines = False , blockClose = pure , blockFinalize = finalizeChildren } listItemSpec :: BlockSpec listItemSpec = BlockSpec { blockName = "ListItem" , blockType = ListItem , blockStart = do ind <- sourceColumn ltypes <- pListStart skipMany spaceOrTab tip :| _ <- psContainerStack <$> getState case blockContainsBlock (containerSpec tip) of Just ListItem -> pure () _ -> addContainer listSpec ind NoData addContainer listItemSpec ind (ListItemData ind ltypes False) , blockContinue = \container -> do True <$ fails (do skipMany spaceOrTab curind <- sourceColumn let liIndent = case containerData container of ListItemData i _ _ -> i _ -> error "Missing ListItemData" tip :| _ <- psContainerStack <$> getState guard (curind <= liIndent) case blockName (containerSpec tip) of "Para" -> void pListStart _ -> pure ()) <|> True <$ followedByBlankLine <|> pure False , blockContainsBlock = Just Normal , blockContainsLines = False , blockClose = pure , blockFinalize = finalizeChildren } pListStart :: P [ListType] pListStart = pBulletListStart <|> pDefinitionListStart <|> pOrderedListStart pBulletListStart :: P [ListType] pBulletListStart = do bulletchar <- satisfyByte (\c -> c == '-' || c == '+' || c == '*') followedByWhitespace (do skipMany spaceOrTab asciiChar '[' status <- (Complete <$ byteString "x]") <|> (Complete <$ byteString "X]") <|> (Incomplete <$ byteString " ]") followedByWhitespace pure [Task status]) <|> pure [Bullet bulletchar] pDefinitionListStart :: P [ListType] pDefinitionListStart = do asciiChar ':' followedByWhitespace pure [Definition] groupLists :: Seq Container -> Seq ([ListType], Seq Container) groupLists = snd . 
foldl' go ([], mempty) where go :: ([ListType], Seq ([ListType], Seq Container)) -> Container -> ([ListType], Seq ([ListType], Seq Container)) go (curtypes, lists) cont = case Seq.viewr lists of Seq.EmptyR -> (getListTypes cont, Seq.singleton (getListTypes cont, Seq.singleton cont)) rest Seq.:> (_, cur) -> let lt = getListTypes cont matchedTypes = [ty | ty <- curtypes, any (ty `matches`) lt] in if null matchedTypes then (getListTypes cont, lists Seq.|> (getListTypes cont, Seq.singleton cont)) -- new list else (matchedTypes, rest Seq.|> (matchedTypes, cur Seq.|> cont)) matches :: ListType -> ListType -> Bool matches (Bullet b1) (Bullet b2) = b1 == b2 matches (Ordered o1) (Ordered o2) = orderedListStyle o1 == orderedListStyle o2 && orderedListDelim o1 == orderedListDelim o2 matches Definition Definition = True matches Task{} Task{} = True matches _ _ = False getListTypes :: Container -> [ListType] getListTypes cont = case containerData cont of ListItemData _ tys _ -> tys _ -> error "Missing ListItemData" pOrderedListStart :: P [ListType] pOrderedListStart = do openParen <- (True <$ asciiChar '(') <|> pure False lookahead $ do skipSome $ skipSatisfyByte (\c -> isAscii c && isAlphaNum c) skipSatisfyByte (\c -> c == '.' 
|| c == ')') stylesAndStarts <- decimalStart <|> romanStart <|> letterStart delimType <- if openParen then LeftRightParen <$ asciiChar ')' else (RightParen <$ asciiChar ')') <|> (RightPeriod <$ asciiChar '.') followedByWhitespace pure $ map (\(style, start) -> Ordered OrderedListAttributes { orderedListStyle = style , orderedListDelim = delimType , orderedListStart = start }) stylesAndStarts where decimalStart = do digits <- some (satisfyByte isDigit) case readMaybe digits of Just n -> pure [(Decimal, n)] Nothing -> mzero letterStart = do c <- satisfyByte (\c -> isAsciiLower c || isAsciiUpper c) if isAsciiLower c then pure [(LetterLower, 1 + (ord c - ord 'a'))] else pure [(LetterUpper, 1 + (ord c - ord 'A'))] romanStart = do (n, lettercase) <- pRomanNumeral let sty = if lettercase == Uppercase then RomanUpper else RomanLower let altsty = if lettercase == Uppercase then LetterUpper else LetterLower pure $ (sty, n) : case n of 1 -> [(altsty, 9)] 5 -> [(altsty, 22)] 10 -> [(altsty, 24)] 50 -> [(altsty, 12)] 100 -> [(altsty, 3)] 500 -> [(altsty, 4)] 1000 -> [(altsty, 13)] _ -> [] data Case = Uppercase | Lowercase deriving (Eq) pRomanNumeral :: P (Int, Case) pRomanNumeral = do let isUpperRomanChar c = c == 'I' || c == 'V' || c == 'X' || c == 'L' || c == 'C' || c == 'D' || c == 'M' let isLowerRomanChar c = c == 'i' || c == 'v' || c == 'x' || c == 'l' || c == 'c' || c == 'd' || c == 'm' let isRomanChar c = isUpperRomanChar c || isLowerRomanChar c lettercase <- lookahead $ do c <- satisfyByte isRomanChar let lettercase = if isUpperRomanChar c then Uppercase else Lowercase skipMany $ skipSatisfyByte $ case lettercase of Uppercase -> isUpperRomanChar Lowercase -> isLowerRomanChar skipSatisfyByte (\d -> d == ')' || d == '.') pure lettercase let rchar uc lc = satisfyByte $ if lettercase == Uppercase then (== uc) else (== lc) let one = rchar 'I' 'i' let five = rchar 'V' 'v' let ten = rchar 'X' 'x' let fifty = rchar 'L' 'l' let hundred = rchar 'C' 'c' let fivehundred = rchar 'D' 
'd' let thousand = rchar 'M' 'm' thousands <- (1000 *) . length <$> many thousand ninehundreds <- option 0 $ hundred >> thousand >> return 900 fivehundreds <- option 0 $ 500 <$ fivehundred fourhundreds <- option 0 $ hundred >> fivehundred >> return 400 hundreds <- (100 *) . length <$> many hundred nineties <- option 0 $ ten >> hundred >> return 90 fifties <- option 0 (50 <$ fifty) forties <- option 0 $ ten >> fifty >> return 40 tens <- (10 *) . length <$> many ten nines <- option 0 $ one >> ten >> return 9 fives <- option 0 (5 <$ five) fours <- option 0 $ one >> five >> return 4 ones <- length <$> many one let total = thousands + ninehundreds + fivehundreds + fourhundreds + hundreds + nineties + fifties + forties + tens + nines + fives + fours + ones if total == 0 then mzero else return (total, lettercase) where option defval p = p <|> pure defval listSpec :: BlockSpec listSpec = BlockSpec { blockName = "List" , blockType = Normal , blockStart = mzero -- added in listItemSpec , blockContinue = \_ -> pure True , blockContainsBlock = Just ListItem , blockContainsLines = False , blockClose = pure , blockFinalize = foldMap itemsToList . groupLists . 
containerChildren } itemsToList :: ([ListType], Seq Container) -> Blocks itemsToList (ltypes, containers) = case containers of Seq.Empty -> mempty _ -> let spacing = case Seq.viewr containers of Seq.EmptyR -> Tight as Seq.:> _ | any itemEndsWithBlank as || any hasChildrenSeparatedWithBlank containers -> Loose _ -> Tight items' = toList items taskListStatus = map getTaskStatus (toList containers) pos = case (Seq.viewl containers, Seq.viewr containers) of (s Seq.:< _, _ Seq.:> e) | containerSourcePos s -> Pos (containerStartLine s) (containerStartColumn s) (containerEndLine e) (containerEndColumn e) _ -> NoPos in addPos pos <$> case ltypes of Bullet{} : _-> bulletList spacing items' Ordered _ : _-> orderedList (chooseOrderedAttr ltypes) spacing items' Definition : _ -> definitionList spacing $ map toDefinition items' Task _ : _ -> taskList spacing $ zip taskListStatus items' [] -> mempty where items = map finalize $ toList containers getTaskStatus cont = case getListTypes cont of ([Task stat] :: [ListType]) -> stat _ -> error "getTaskStatus: wrong shape" -- when ambiguous between roman and lettered list, choose roman if start number is 1, -- otherwise lettered chooseOrderedAttr os = case [at | Ordered at <- os, isRomanStartOne at] of (a:_) -> a _ -> case [at | Ordered at <- os, isLettered at] of (a:_) -> a _ -> case [at | Ordered at <- os] of (a:_) -> a [] -> error "chooseOrderedAttr on empty list" isRomanStartOne at = (orderedListStyle at == RomanUpper || orderedListStyle at == RomanLower) && orderedListStart at == 1 isLettered at = orderedListStyle at == LetterUpper || orderedListStyle at == LetterLower -- | We determine whether a list item ends with a blank line by -- comparing its end line with the end line of its last child. 
-- | True if a list item ends with a blank line, judged by its recorded
-- end line being later than that of its last child.  An item with no
-- children is never considered to end with a blank.
itemEndsWithBlank :: Container -> Bool
itemEndsWithBlank li =
  case Seq.viewr (containerChildren li) of
    Seq.EmptyR -> False
    _ Seq.:> lastChild -> containerEndLine li > containerEndLine lastChild

-- | We don't count blanks before lists, because
-- otherwise it would be impossible to have nested tight lists.
hasChildrenSeparatedWithBlank :: Container -> Bool
hasChildrenSeparatedWithBlank cont =
  or $ Seq.zipWith check children (Seq.drop 1 children)
 where
   -- for definition lists, skip the term (first child) -- TODO confirm
   children = (if Definition `elem` liTypes then Seq.drop 1 else id) $
                containerChildren cont
   -- a gap counts only when the later child is not a list and starts
   -- after the line on which the earlier child ended
   check x y = (blockName (containerSpec y) /= "List") &&
               (containerStartLine y > containerEndLine x)
   liTypes = getListTypes cont

-- | Split a definition-list item body: a leading Para becomes the term
-- (its inlines), the rest becomes the definition.  If the body does not
-- start with a Para, the term is empty and the whole body is the definition.
toDefinition :: Blocks -> (Inlines, Blocks)
toDefinition bs =
  case Seq.viewl bs' of
    Node _ _ (Para ils) Seq.:< _ -> (ils, Many (Seq.drop 1 bs'))
    _ -> (mempty, bs)
 where
   bs' = unMany bs

-- | Spec for Section containers.  Sections are never started directly;
-- headingSpec opens them (and closeContainingSections closes them).
-- On close, an identifier is taken from an explicit @id@ attribute or
-- generated from the heading text, and an implicit reference to the
-- section is registered.
sectionSpec :: BlockSpec
sectionSpec =
  BlockSpec
  { blockName = "Section"
  , blockType = Normal
  , blockStart = mzero -- these containers are added by headingSpec
  , blockContinue = \_ -> pure True -- these are closed by headingSpec
  , blockContainsBlock = Just Normal
  , blockContainsLines = False
  , blockClose = \container -> do
      case containerChildren container of
        h Seq.:<| _
          | blockName (containerSpec h) == "Heading" -> do
             let lev = case containerData container of
                         SectionData n _ -> n
                         _ -> error "Missing SectionData"
             let ils = case containerData h of
                         HeadingData _ xs -> xs
                         _ -> error "Missing HeadingData"
             (secid, attr, label) <- do
               let bs = inlinesToByteString ils
               let Attr ats = containerAttr container
               case lookup "id" ats of
                 Just id' -> pure (id', mempty, normalizeLabel bs)
                 Nothing -> do -- generate id from title
                   -- append "-1", "-2", ... until the candidate is unused
                   let generateId (n :: Int) base = do
                         let candidate
                              | n == 0 = base
                              | otherwise = base <> "-" <> B8.pack (show n)
                         ids <- psIds <$> getState
                         if candidate `Set.member` ids
                            then generateId (n+1) base
                            else do
                              updateState $ \st ->
                                st{ psIds = Set.insert candidate (psIds st)
                                  , psAutoIds = Set.insert candidate
                                                  (psAutoIds st) }
                              pure candidate
                   ident <- generateId 0 (toIdentifier bs)
                   pure (ident, mempty, normalizeLabel bs)
             -- add implicit reference
             let dest = "#" <> secid
             updateState $ \st ->
               st{ psAutoReferenceMap =
                     insertReference label (dest, Attr [])
                       (psAutoReferenceMap st) }
             pure container{ containerData = SectionData lev (Just secid)
                           , containerAttr = containerAttr container <> attr }
        _ -> pure container
  , blockFinalize = \container ->
      let blocks = finalizeChildren container
          secid = case containerData container of
                    SectionData _ ident -> ident
                    _ -> error "Missing SectionData"
      in  addSourcePos container $
            maybe id (\ident -> addAttr (Attr [("id", ident)])) secid
              <$> section blocks
  }

-- | Spec for block quotes: lines prefixed with @>@ followed by whitespace.
blockQuoteSpec :: BlockSpec
blockQuoteSpec =
  BlockSpec
  { blockName = "BlockQuote"
  , blockType = Normal
  , blockStart = do
      ind <- sourceColumn
      asciiChar '>'
      followedByWhitespace
      skipMany spaceOrTab
      addContainer blockQuoteSpec ind NoData
  , blockContinue = \_ -> do
      skipMany spaceOrTab
      asciiChar '>'
      followedByWhitespace
      pure True
  , blockContainsBlock = Just Normal
  , blockContainsLines = False
  , blockClose = pure
  , blockFinalize = \container ->
      addSourcePos container $ blockQuote $ finalizeChildren container
  }

-- | Spec for pipe tables.  Raw row text is accumulated while the
-- table is open and only parsed into cells at close time.
tableSpec :: BlockSpec
tableSpec =
  BlockSpec
  { blockName = "Table"
  , blockType = Normal
  , blockStart = do
      lookahead pRawTableRow
      ind <- sourceColumn
      addContainer tableSpec ind (TableData mempty)
  , blockContinue = \container -> do
      skipMany spaceOrTab
      -- TODO: this is inefficient; we parse the inline contents
      -- twice. Find a better way.
      let parsedBlankOrCaption =
            case Seq.viewr (containerText container) of
              _ Seq.:> c -> not (B8.any (=='|') (chunkBytes c))
              Seq.EmptyR -> False
      (True <$ -- if we just parsed a blank or caption line, no more table rows
        (guard (not parsedBlankOrCaption) <* lookahead pRawTableRow))
       <|> (True <$ followedByBlankLine)
       <|> (True <$ (skipMany spaceOrTab *>
                       lookahead (asciiChar '^' *> spaceOrTab)))
       <|> (True <$ guard (not (null (containerChildren container))))
  , blockContainsBlock = Just CaptionBlock
  , blockContainsLines = True
  , blockClose = \container -> do
      let lns = containerText container
      rows <- reverse . snd <$> foldM parseTableRow ([], []) lns
      pure $ container{ containerData = TableData rows }
  , blockFinalize = \container ->
      let rows = case containerData container of
                   TableData rs -> rs
                   _ -> error "Missing TableData"
          -- a child container, if present, is the caption
          mbCaption =
            case Seq.viewr (containerChildren container) of
              Seq.EmptyR -> Nothing
              _ Seq.:> x -> Just . Caption $ blockFinalize (containerSpec x) x
      in  addSourcePos container $ table mbCaption rows
  }

-- | Fold one raw line into (current alignments, rows so far).
-- A separator line (e.g. @|:--|--:|@) updates the alignments and
-- re-tags the previous row's cells as head cells; an ordinary row is
-- prepended to the accumulated rows (caller reverses at the end).
parseTableRow :: ([Align], [[Cell]]) -> Chunk -> P ([Align], [[Cell]])
parseTableRow (aligns, rows) chunk =
  case B8.uncons (B8.strip $ chunkBytes chunk) of
    Just ('|',_) -> do
      res <- pTableCells aligns chunk
      case res of
        Left aligns' ->
          pure (aligns',
                case rows of
                  r:rs -> zipWith toHeadCell aligns' r : rs
                  [] -> [])
        Right cells -> pure (aligns, cells : rows)
    Nothing -> pure (aligns, rows)   -- blank line: no change
    Just (_,_) -> mzero              -- not a table line
 where
   toHeadCell align' (Cell _ _ ils) = Cell HeadCell align' ils

-- | Parse a raw line either as a separator line (Left alignments) or
-- as a row of body cells (Right cells), padding missing alignments
-- with 'AlignDefault'.
pTableCells :: [Align] -> Chunk -> P (Either [Align] [Cell])
pTableCells aligns chunk =
  case parse pTableSeps () [chunk] of
    Just aligns' -> pure $ Left aligns'
    Nothing -> do
      opts <- psParseOptions <$> getState
      case parseTableCells opts chunk of
        Right cs ->
          pure $ Right $
            zipWith (Cell BodyCell) (aligns ++ repeat AlignDefault) cs
        Left _ -> mzero

-- | Parse a table separator line; colons on either end of the dashes
-- determine the column alignment.
pTableSeps :: Parser () [Align]
pTableSeps = do
  skipMany spaceOrTab
  asciiChar '|'
  many pTableSep <* skipMany ws <* eof
 where
   pTableSep = do
     skipMany spaceOrTab
     start <- (True <$ asciiChar ':') <|> pure False
     skipSome (asciiChar '-')
     end <- (True <$ asciiChar ':') <|> pure False
     skipMany spaceOrTab
     asciiChar '|'
     pure $ case (start, end) of
              (True, True) -> AlignCenter
              (True, False) -> AlignLeft
              (False, True) -> AlignRight
              (False, False) -> AlignDefault

-- | Succeed (consuming the line) only if the rest of the line parses
-- as a table row; used as a lookahead by 'tableSpec'.
pRawTableRow :: P ()
pRawTableRow = do
  lookahead $ asciiChar '|'
  curline <- sourceLine
  curcolumn <- sourceColumn
  bs <- restOfLine
  void $ parseTableRow ([],[])
           Chunk{ chunkLine = curline
                , chunkColumn = curcolumn
                , chunkBytes = bs }

-- | Spec for table captions: @^@ followed by whitespace, with
-- continuation lines indented past the caption marker.
captionSpec :: BlockSpec
captionSpec =
  BlockSpec
  { blockName = "Caption"
  , blockType = CaptionBlock
  , blockStart = do
      ind <- sourceColumn
      asciiChar '^'
      void spaceOrTab
      addContainer captionSpec ind $ CaptionData ind
  , blockContinue = \container ->
      (do skipMany spaceOrTab
          curind <- sourceColumn
          let ind = case containerData container of
                      CaptionData i -> i
                      _ -> error "Missing CaptionData"
          guard (curind > ind) <|> followedByBlankLine
          pure True)
      <|> pure False
  , blockContainsBlock = Just Normal
  , blockContainsLines = False
  , blockClose = pure
  , blockFinalize = finalizeChildren
  }

-- | Spec for thematic breaks: at least three @-@ or @*@ characters
-- (spaces/tabs allowed between), alone on a line.
thematicBreakSpec :: BlockSpec
thematicBreakSpec =
  BlockSpec
  { blockName = "ThematicBreak"
  , blockType = Normal
  , blockStart = do
      let breakChar = skipSatisfyByte (\c -> c == '-' || c == '*')
                        *> skipMany spaceOrTab
      ind <- sourceColumn
      breakChar *> breakChar *> breakChar *> skipMany breakChar
      lookahead endline
      addContainer thematicBreakSpec ind NoData
  , blockContinue = \_ -> pure False
  , blockContainsBlock = Nothing
  , blockContainsLines = True
  , blockClose = pure
  , blockFinalize = \container -> addSourcePos container thematicBreak
  }

-- | Spec for ATX-style headings (@#@ runs).  Starting a heading also
-- opens an enclosing Section container at the same level, after
-- closing any sections of equal or deeper level.
headingSpec :: BlockSpec
headingSpec =
  BlockSpec
  { blockName = "Heading"
  , blockType = Normal
  , blockStart = do
      ind <- sourceColumn
      lev <- length <$> some (asciiChar '#')
      followedByWhitespace
      skipMany spaceOrTab
      closeContainingSections lev
      addContainer sectionSpec ind $ SectionData lev Nothing
      addContainer headingSpec ind $ HeadingData lev mempty
  , blockContinue = \container -> do
      do skipMany spaceOrTab
         let lev = case containerData container of
                     HeadingData n _ -> n
                     _ -> error "Missing HeadingData"
         -- continue on a matching '#' prefix of the same level;
         -- stop on a different level, a blank line, or eof
         (True <$ (do lev' <- length <$> some (asciiChar '#')
                      guard (lev' == lev)
                      skipMany spaceOrTab))
          <|> (False <$ do lookahead (asciiChar '#' <|> endline <|> eof))
          <|> pure True
  , blockContainsBlock = Nothing
  , blockContainsLines = True
  , blockClose = \container -> do
      ils <- parseTextLines container
      let lev = case containerData container of
                  HeadingData n _ -> n
                  _ -> error "Missing HeadingData"
      pure $ container{ containerData = HeadingData lev ils }
  , blockFinalize = \container ->
      let (lev, title) = case containerData container of
                           HeadingData l t -> (l, t)
                           _ -> error "Missing HeadingData"
      in  addSourcePos container $ heading lev title
  }

-- | Spec for fenced code blocks (three or more backticks, optional
-- language string).  A language beginning with @=@ yields a raw block
-- in that format instead of a code block.
codeBlockSpec :: BlockSpec
codeBlockSpec =
  BlockSpec
  { blockName = "CodeBlock"
  , blockType = Normal
  , blockStart = do
      indent <- sourceColumn
      ticks <- byteStringOf $
                 asciiChar '`' *> asciiChar '`' *> skipSome (asciiChar '`')
      skipMany spaceOrTab
      lang <- (byteStringOf
                 (skipSome $ skipSatisfyByte (\c -> c /= '`' && not (isWs c)))
                <* skipMany spaceOrTab)
              <|> pure ""
      lookahead endline
      addContainer codeBlockSpec indent (CodeBlockData ticks lang indent)
  , blockContinue = \container -> do
      let (ticks, indent) = case containerData container of
                              CodeBlockData t _ i -> (t, i)
                              _ -> error "Missing CodeBlockData"
      gobbleSpaceToIndent indent
      -- a closing fence (at least as many ticks) ends the block
      (do skipMany spaceOrTab
          byteString ticks
          skipMany (asciiChar '`')
          skipMany spaceOrTab
          lookahead endline
          pure False) <|> pure True
  , blockContainsBlock = Nothing
  , blockContainsLines = True
  , blockClose = pure
  , blockFinalize = \container ->
      let lang = case containerData container of
                   CodeBlockData _ l _ -> l
                   _ -> error "Missing CodeBlockData"
          -- drop first line which should be empty
          bs = foldMap chunkBytes (Seq.drop 1 $ containerText container)
      in  addSourcePos container $
            case B8.uncons lang of
              Just ('=', fmt) -> rawBlock (Format fmt) bs
              _ -> codeBlock lang bs
  }
-- | Spec for fenced divs (three or more colons, optional class label).
divSpec :: BlockSpec
divSpec =
  BlockSpec
  { blockName = "Div"
  , blockType = Normal
  , blockStart = do
      ind <- sourceColumn
      colons <- byteStringOf $
                  asciiChar ':' *> asciiChar ':' *> skipSome (asciiChar ':')
      skipMany spaceOrTab
      label <- byteStringOf $ skipMany $ skipSatisfyByte (not . isWs)
      skipMany spaceOrTab
      lookahead endline
      addContainer divSpec ind (DivData colons label)
  , blockContinue = \container ->
      (do tip <- getTip
          -- see jgm/djot.js#109
          guard $ blockName (containerSpec tip) /= "CodeBlock"
          skipMany spaceOrTab
          let colons = case containerData container of
                         DivData c _ -> c
                         _ -> error "Missing DivData"
          byteString colons
          skipMany (asciiChar ':')
          skipMany spaceOrTab
          lookahead endline
          pure False)
      <|> pure True
  , blockContainsBlock = Just Normal
  , blockContainsLines = False
  , blockClose = pure
  , blockFinalize = \container ->
      let label = case containerData container of
                    DivData _ l -> l
                    _ -> error "Missing DivData"
          -- drop first line which should be empty
          bls = finalizeChildren container
      in  (if B.null label
              then id
              else addAttr (Attr [("class", label)]))
            <$> addSourcePos container (div bls)
  }

-- | Spec for block attribute lines (@{...}@).  Lines are collected while
-- the attribute parser reports Partial/Failed; at close, a successful
-- parse stores the attributes for the next block, otherwise the lines
-- are reinterpreted as a paragraph.
attrSpec :: BlockSpec
attrSpec =
  BlockSpec
  { blockName = "Attributes"
  , blockType = Normal
  , blockStart = do
      ind <- sourceColumn
      lookahead $ asciiChar '{'
      addContainer attrSpec ind $ AttributeData ind
  , blockContinue = \container -> do
      let ind = case containerData container of
                  AttributeData i -> i
                  _ -> error "Missing AttributeData"
      skipMany spaceOrTab
      curind <- sourceColumn
      mbapstate <- psAttrParserState <$> getState
      if curind <= ind
         then pure False   -- continuation lines must be indented
         else do
           let lastLine = case Seq.viewr (containerText container) of
                            _ Seq.:> ll -> chunkBytes ll
                            _ -> mempty
           case parseAttributes mbapstate lastLine of
             Done _ -> pure False
             Partial apstate' -> do
               updateState $ \st -> st{ psAttrParserState = Just apstate' }
               pure True
             Failed _ -> pure True -- not yet: keep going!
  , blockContainsBlock = Nothing
  , blockContainsLines = True
  , blockClose = \container -> do
      let bs = foldMap chunkBytes $ containerText container
      case parseAttributes Nothing bs of
        Done (attr, off)
          | B8.all isWs (B8.drop off bs) -> do
              updateState $ \st ->
                st{ psAttributes = psAttributes st <> attr }
              pure container
          | otherwise -> do
              ils <- parseTextLines container
              pure $ container{ containerSpec = paraSpec
                              , containerInlines = ils }
        _ -> do -- could not parse lines as attribute, treat as Para
          ils <- parseTextLines container
          pure $ container{ containerSpec = paraSpec
                          , containerInlines = ils }
  , blockFinalize = const mempty
  }

-- | Spec for link reference definitions (@[label]: url@).  Produces no
-- output block; the reference is stored in the parser state.  The URL
-- may be split over indented continuation lines; whitespace bytes
-- (<= 32) are filtered out when the lines are joined.
referenceDefinitionSpec :: BlockSpec
referenceDefinitionSpec =
  BlockSpec
  { blockName = "ReferenceDefinition"
  , blockType = Normal
  , blockStart = do
      ind <- sourceColumn
      asciiChar '['
      fails (asciiChar '^') -- footnote
      label <- byteStringOf
                 (some (skipSatisfyByte (\c -> c /= ']' && c /= '\n')))
      asciiChar ']'
      asciiChar ':'
      skipMany spaceOrTab
      addContainer referenceDefinitionSpec ind
        (ReferenceData (normalizeLabel label))
  , blockContinue = \_ ->
      True <$ skipSome spaceOrTab `notFollowedBy` endline
  , blockContainsBlock = Nothing
  , blockContainsLines = True
  , blockClose = \container -> do
      let label = case containerData container of
                    ReferenceData l -> l
                    _ -> error "Missing ReferenceData"
      let attr = containerAttr container
      let dest = B.filter (> 32) . foldMap chunkBytes $
                   containerText container
      updateState $ \st ->
        st{ psReferenceMap = insertReference label (dest, attr)
                               (psReferenceMap st) }
      pure container
  , blockFinalize = const mempty
  }

-- | Spec for footnote definitions (@[^label]: ...@).  Produces no
-- output block; the finalized note body is stored in the note map.
footnoteSpec :: BlockSpec
footnoteSpec =
  BlockSpec
  { blockName = "Footnote"
  , blockType = Normal
  , blockStart = do
      ind <- sourceColumn
      asciiChar '['
      asciiChar '^'
      label <- byteStringOf
                 (some (skipSatisfyByte (\c -> c /= ']' && c /= '\n')))
      asciiChar ']'
      asciiChar ':'
      skipMany spaceOrTab
      addContainer footnoteSpec ind $ FootnoteData ind (normalizeLabel label)
  , blockContinue = \container ->
      (do skipMany spaceOrTab
          curind <- sourceColumn
          let ind = case containerData container of
                      FootnoteData i _ -> i
                      _ -> error "Missing FootnoteData"
          guard (curind > ind) <|> followedByBlankLine
          pure True)
      <|> pure False
  , blockContainsBlock = Just Normal
  , blockContainsLines = True
  , blockClose = \container -> do
      let label = case containerData container of
                    FootnoteData _ l -> l
                    _ -> error "Missing FootnoteData"
      let bls = finalizeChildren container
      updateState $ \st ->
        st{ psNoteMap = insertNote label bls (psNoteMap st) }
      pure container
  , blockFinalize = const mempty
  }

-- | Spec for paragraphs: any non-blank line; ends at a blank line.
paraSpec :: BlockSpec
paraSpec =
  BlockSpec
  { blockName = "Para"
  , blockType = Normal
  , blockStart = do
      fails followedByBlankLine
      ind <- sourceColumn
      addContainer paraSpec ind NoData
  , blockContinue = \_ -> do
      skipMany spaceOrTab
      (False <$ lookahead (endline <|> eof)) <|> pure True
  , blockContainsBlock = Nothing
  , blockContainsLines = True
  , blockClose = \container -> do
      ils <- parseTextLines container
      pure $ container{ containerInlines = ils }
  , blockFinalize = \container ->
      addSourcePos container . para $ containerInlines container
  }

-- | Parse a container's accumulated text lines as inlines.
-- NOTE(review): a parse failure calls 'error' -- presumably inline
-- parsing cannot fail on well-formed chunks; confirm upstream.
parseTextLines :: Container -> P Inlines
parseTextLines cont = do
  opts <- psParseOptions <$> getState
  either error pure . parseInlines opts $ containerText cont

-- | A fresh, empty document-level container.
emptyContainer :: Container
emptyContainer =
  Container { containerSpec = docSpec
            , containerChildren = mempty
            , containerText = mempty
            , containerInlines = mempty
            , containerStartLine = 1
            , containerStartColumn = 0
            , containerEndLine = 1
            , containerEndColumn = 0
            , containerData = NoData
            , containerAttr = mempty
            , containerSourcePos = False
            }

-- | An open block under construction: its spec, closed children,
-- accumulated raw text (for line containers), parsed inlines,
-- source extent, and spec-specific data.
data Container =
  Container
  { containerSpec :: BlockSpec
  , containerChildren :: Seq Container
  , containerText :: Seq Chunk
  , containerInlines :: Inlines
  , containerStartLine :: Int
  , containerStartColumn :: Int
  , containerEndLine :: Int
  , containerEndColumn :: Int
  , containerData :: ContainerData
  , containerAttr :: Attr
  , containerSourcePos :: Bool
  }

-- | Per-spec payload carried by a 'Container'.
data ContainerData =
    NoData
  | ListItemData Int [ListType] Bool
  | SectionData Int (Maybe ByteString)
  | HeadingData Int Inlines
  | CodeBlockData ByteString ByteString Int
  | DivData ByteString ByteString
  | FootnoteData Int ByteString
  | TableData [[Cell]]
  | CaptionData Int
  | AttributeData Int
  | ReferenceData ByteString
  deriving (Show, Eq, Ord, Typeable)

-- | The kind of a list item's marker.
data ListType =
    Bullet Char
  | Ordered OrderedListAttributes
  | Definition
  | Task TaskStatus
  deriving (Show, Ord, Eq)

-- | Parser state: the open-container stack plus reference/note maps,
-- pending block attributes, seen identifiers, and the end position of
-- the previously processed line.
data PState =
  PState { psParseOptions :: ParseOptions
         , psContainerStack :: NonEmpty Container
         , psReferenceMap :: ReferenceMap
         , psAutoReferenceMap :: ReferenceMap
         , psNoteMap :: NoteMap
         , psAttributes :: Attr
         , psAttrParserState :: Maybe AttrParserState
         , psIds :: Set ByteString
         , psAutoIds :: Set ByteString
         , psLastColumnPrevLine :: Int
         , psLastLine :: Int
         }

type P = Parser PState

-- | Parse a complete document, collecting footnotes, references, and
-- auto-generated identifiers from the final state.
pDoc :: P Doc
pDoc = do
  bls <- pBlocks <* eof
  st <- getState
  pure $ Doc{ docBlocks = bls
            , docFootnotes = psNoteMap st
            , docReferences = psReferenceMap st
            , docAutoReferences = psAutoReferenceMap st
            , docAutoIdentifiers = psAutoIds st }

-- | Process all input lines, then close and finalize every container.
pBlocks :: P Blocks
pBlocks = processLines >> finalizeDocument

-- | Return value is True if all continuations match.
-- | Run 'blockContinue' on each open container, outermost first.
-- Returns True if every container's continuation matched; otherwise all
-- containers from the first mismatch inward are closed.
checkContinuations :: NonEmpty Container -> P Bool
checkContinuations = go . reverse . NonEmpty.toList
 where
   go [] = return True
   go (c:cs) = do
     continue <- (Just <$> blockContinue (containerSpec c) c) <|> pure Nothing
     when (continue == Just False) $ do -- early exit
       -- record end position before the containers are closed
       curline <- sourceLine
       curcol <- sourceColumn
       updateState $ \st -> st{ psLastLine = curline
                              , psLastColumnPrevLine = curcol - 1 }
     if fromMaybe False continue
        then go cs
        else False <$ -- close len (c:cs) containers
               replicateM_ (length (c:cs)) closeCurrentContainer

{-# INLINE processLines #-}
-- | Main loop: for each input line, check continuations, try container
-- starts, handle lazy paragraph continuation, and append the remainder
-- of the line to the current line container (if any).  Recurses until
-- end of input.
processLines :: P ()
processLines = do
  -- check continuations for open containers and close any that don't match
  containers <- psContainerStack <$> getState
  allContainersMatch <- checkContinuations containers
  -- check for new container starts and open if needed
  newContainersAdded <- tryContainerStarts
  followedByBlankLine <|>
    do -- determine if we have a lazy line
       let isLazy = not (allContainersMatch || newContainersAdded) &&
             blockName (containerSpec (NonEmpty.head containers)) == "Para"
       when isLazy $ -- restore original containers
         updateState (\st -> st{ psContainerStack = containers })
       tip <- getTip
       case blockContainsBlock (containerSpec tip) of
         Just bt | bt == Normal || bt == ListItem -> do
           -- add a paragraph container
           skipMany spaceOrTab
           blockStart paraSpec
         _ -> pure ()
  !curline <- sourceLine
  !curcolumn <- sourceColumn
  restline <- byteStringOf $ do
    skipMany (skipSatisfyByte (\c -> c /= '\n' && c /= '\r'))
  !lastcolumn <- sourceColumn
  optional_ endline
  updateState $ \st -> st{ psLastColumnPrevLine = lastcolumn - 1
                         , psLastLine = curline }
  -- if current container is a line container, add remainder of line
  modifyContainers $ \(c :| rest) ->
    if blockContainsLines (containerSpec c)
       then c{ containerText = containerText c Seq.|>
                 Chunk{ chunkLine = curline
                      , chunkColumn = curcolumn
                      , chunkBytes = restline } } :| rest
       else c :| rest
  eof <|> processLines

-- True if new container was started
tryContainerStarts :: P Bool
tryContainerStarts = do
  (c :| _) <- psContainerStack <$> getState
  case blockContainsBlock (containerSpec c) of
    Just bt ->
      -- dispatch on the first non-space ASCII byte of the line
      (do nextc <- lookahead (satisfyByte isAscii)
          next <- if nextc == ' ' || nextc == '\t'
                     then skipMany spaceOrTab *>
                            lookahead (satisfyByte isAscii)
                     else pure nextc
          case next of
            '>' -> blockStart blockQuoteSpec
            '#' -> blockStart headingSpec
            ':' -> blockStart divSpec <|> blockStart listItemSpec
            '*' -> blockStart thematicBreakSpec <|> blockStart listItemSpec
            '-' -> blockStart thematicBreakSpec <|> blockStart listItemSpec
            '`' -> blockStart codeBlockSpec
            '{' -> blockStart attrSpec
            '[' -> blockStart referenceDefinitionSpec
                     <|> blockStart footnoteSpec
            '|' | bt /= CaptionBlock -> blockStart tableSpec
            '^' | bt == CaptionBlock -> blockStart captionSpec
            _ -> blockStart listItemSpec
          -- keep trying: a start may enable further nested starts
          True <$ tryContainerStarts) <|> pure False
    _ -> pure False

-- | Close and finalize containers, returning Blocks.
finalizeDocument :: P Blocks
finalizeDocument = do
  cs <- psContainerStack <$> getState
  case cs of
    _ :| [] -> closeCurrentContainer >> finalize <$> getTip
    _ -> closeCurrentContainer >> finalizeDocument

{-# INLINE closeCurrentContainer #-}
-- | Close container and add to parent container.
closeCurrentContainer :: P ()
closeCurrentContainer = do
  cs <- psContainerStack <$> getState
  cs' <- case cs of
           _ :| [] -> pure cs  -- never close the root container
           c :| rest -> do
             -- record an explicit id so auto-generated ids won't collide
             case containerAttr c of
               Attr as | Just ident <- lookup "id" as
                 -> updateState $ \st ->
                      st{ psIds = Set.insert ident (psIds st) }
               _ -> pure ()
             c' <- blockClose (containerSpec c) c
             pure (c':|rest)
  -- stamp the end position and attach to the parent (or keep as root)
  case cs' of
    c :| (d:rest) ->
      updateState $ \st ->
        st{ psContainerStack =
              d{ containerChildren = containerChildren d Seq.|>
                   c{ containerEndLine = psLastLine st
                    , containerEndColumn = psLastColumnPrevLine st } }
                :| rest }
    c :| [] ->
      updateState $ \st ->
        st{ psContainerStack =
              c{ containerEndLine = psLastLine st
               , containerEndColumn = psLastColumnPrevLine st } :| [] }

{-# INLINE modifyContainers #-}
-- | Apply a function to the whole container stack.
modifyContainers :: (NonEmpty Container -> NonEmpty Container) -> P ()
modifyContainers f =
  updateState $ \st -> st{ psContainerStack = f (psContainerStack st) }

{-# INLINE addContainer #-}
-- | Open a new container at the given column, consuming any pending
-- block attributes and first closing containers that cannot contain it.
addContainer :: BlockSpec -> Int -> ContainerData -> P ()
addContainer bspec curcol bdata = do
  curline <- sourceLine
  attr <- psAttributes <$> getState
  opts <- psParseOptions <$> getState
  let newcontainer =
        emptyContainer { containerSpec = bspec
                       , containerStartLine = curline
                       , containerStartColumn = curcol
                       , containerEndLine = curline
                       , containerEndColumn = curcol
                       , containerData = bdata
                       , containerAttr = attr
                       , containerSourcePos =
                           sourcePositions opts /= NoSourcePos }
  -- attribute containers accumulate; anything else consumes them
  unless (blockName bspec == "Attributes") $
    updateState $ \st -> st{ psAttributes = mempty }
  closeInappropriateContainers bspec
  modifyContainers (newcontainer NonEmpty.<|)

closeInappropriateContainers :: BlockSpec -> P ()
closeInappropriateContainers spec = do
  -- close containers until we get one that can accept this type of container
  cs <- psContainerStack <$> getState
  case cs of
    c :| _
      | blockContainsBlock (containerSpec c) == Just (blockType spec) ->
          pure ()
      | otherwise ->
          closeCurrentContainer *> closeInappropriateContainers spec

-- | Finalize a closed container into Blocks, applying its attributes.
finalize :: Container -> Blocks
finalize cont =
  addAttr (containerAttr cont) <$>
    blockFinalize (containerSpec cont) cont

-- | Attach source position info to Blocks if enabled for the container.
addSourcePos :: Container -> Blocks -> Blocks
addSourcePos cont =
  if containerSourcePos cont
     then fmap (addPos (Pos (containerStartLine cont)
                            (containerStartColumn cont)
                            (containerEndLine cont)
                            (containerEndColumn cont)))
     else id

-- | Finalize all of a container's children, in order.
finalizeChildren :: Container -> Blocks
finalizeChildren = foldMap finalize . containerChildren

-- Gobble as much space as possible up to indent.
gobbleSpaceToIndent :: Int -> P ()
gobbleSpaceToIndent indent = do
  curindent <- sourceColumn
  when (curindent < indent) $
    optional_ (spaceOrTab *> gobbleSpaceToIndent indent)

{-# INLINE getTip #-}
-- Get tip of container stack.
getTip :: P Container
getTip = NonEmpty.head . psContainerStack <$> getState

-- | Close open Section containers whose level is >= the given level
-- (used before starting a new heading).
closeContainingSections :: Int -> P ()
closeContainingSections lev = do
  tip <- getTip
  case containerData tip of
    SectionData lev' _ | lev' >= lev ->
      closeCurrentContainer >> closeContainingSections lev
    _ -> pure ()

-- TODO avoid detour through String
-- | Derive an identifier from heading text: symbol characters become
-- word separators and the words are joined with hyphens ("sec" if empty).
toIdentifier :: ByteString -> ByteString
toIdentifier bs =
  if null parts
     then "sec"
     else strToUtf8 $ intercalate "-" parts
 where
   isSym = (`elem` ("][~!@#$%^&*(){}`,.<>\\|=+/" :: [Char]))
   parts = words $ map (\c -> if isSym c then ' ' else c) $ utf8ToStr bs
djot-0.1.2.4/src/Djot/Djot.hs0000644000000000000000000004655607346545000013745 0ustar0000000000000000{-# LANGUAGE TupleSections #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE OverloadedLists #-}
{-# LANGUAGE StrictData #-}
-- | Renderer from the djot AST back to djot source.
module Djot.Djot
  ( renderDjot
  , RenderOptions(..)
  )
where

import Djot.AST
import Djot.Options (RenderOptions(..))
import Data.Char (ord, chr)
import Djot.Parse (utf8ToStr)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as B8
import qualified Data.Set as Set
import Data.Set (Set)
import qualified Data.Sequence as Seq
import qualified Data.Map.Strict as M
import Data.Maybe (fromMaybe)
import Data.List (sortOn, intersperse, transpose)
import Control.Monad
import Control.Monad.State
import qualified Data.Foldable as F
import Text.DocLayout hiding (Doc)
import qualified Text.DocLayout as Layout
import Data.Text (Text)
import qualified Data.Text as T
import Data.Text.Encoding (decodeUtf8With)
import Data.Text.Encoding.Error (lenientDecode)
import qualified Data.IntMap.Strict as IntMap

-- | Render a Doc as djot source: body, then reference definitions,
-- then footnotes.
renderDjot :: RenderOptions -> Doc -> Layout.Doc Text
renderDjot opts doc =
  evalState
    (do body <- toLayout (docBlocks doc)
        refs <- gets referenceMap >>= toReferences
        notes <- toNotes
        pure $ body $$ refs $$ notes <> cr)
    BState{ noteMap = docFootnotes doc
          , noteOrder = mempty
          , referenceMap = docReferences doc
          , autoIds = docAutoIdentifiers doc
          , afterSpace = True
          , nestings = IntMap.fromList -- anything not in this list
                                       -- will ALWAYS get {}:
                         [(ord '_', 0)
                         ,(ord '*', 0)
                         ,(ord '~', 0)
                         ,(ord '\'', 0)
                         ,(ord '"', 0)
                         ,(ord '^', 0)]
          , lastBullet = Nothing
          , options = opts }

-- | Rendering state: note ordering, references, emphasis-nesting
-- counters, and context used to decide when delimiters need braces.
data BState =
  BState { noteMap :: NoteMap
         , noteOrder :: M.Map ByteString Int
         , referenceMap :: ReferenceMap
         , autoIds :: Set ByteString
         , afterSpace :: Bool
         , nestings :: IntMap.IntMap Int
         , lastBullet :: Maybe Char
         , options :: RenderOptions
         }

-- | Render all reference definitions.
toReferences :: ReferenceMap -> State BState (Layout.Doc Text)
toReferences (ReferenceMap refs) =
  (<> cr) . vcat <$> mapM toReference (M.toList refs)

-- | Render one reference definition (attributes line, then @[label]: url@).
toReference :: (ByteString, (ByteString, Attr))
            -> State BState (Layout.Doc Text)
toReference (label, (url, attr)) = do
  attr' <- toLayout attr
  let ref = "[" <> literal (fromUtf8 label) <> "]:" <+> literal (fromUtf8 url)
  pure $ attr' $$ ref

-- | Render all footnotes, ordered by first reference in the document.
toNotes :: State BState (Layout.Doc Text)
toNotes = do
  noterefs <- gets noteOrder
  allLabels <- gets (M.keys . unNoteMap . noteMap)
  let sortedLabels = sortOn (`M.lookup` noterefs) allLabels
  (<> cr) . vsep <$> mapM toNote sortedLabels

-- | Render one footnote, hanging its body under the @[^label]:@ marker.
toNote :: ByteString -> State BState (Layout.Doc Text)
toNote label = do
  notes <- gets noteMap
  case lookupNote label notes of
    Nothing -> pure mempty
    Just bls ->
      hang 4 (toNoteRef label <> ":" <> space) <$> toLayout bls

-- | Decode UTF-8 leniently (invalid bytes become replacement chars).
fromUtf8 :: ByteString -> Text
fromUtf8 = decodeUtf8With lenientDecode

data EscapeContext = Normal

{-# INLINE escapeDjot #-}
-- | Backslash-escape characters that are meaningful in djot syntax.
-- Special-cases: @$`@, runs of @-@ and @.@, and a symbol before @:@ at
-- a word boundary.
escapeDjot :: EscapeContext -> ByteString -> Text
escapeDjot Normal bs
  | B8.any escapable bs = T.pack. go . utf8ToStr $ bs
  | otherwise = fromUtf8 bs
 where
   escapable c = c == '[' || c == ']' || c == '<' || c == '>' ||
                 c == '$' || c == '!' || c == '{' || c == '}' ||
                 c == ':' || c == '-' || c == '^' || c == '~' ||
                 c == '*' || c == '_' || c == '\''|| c == '"' ||
                 c == '.' || c == '|' || c == '`' || c == '\\'
   go [] = []
   go ('$':c:cs)
     | c == '`' = '\\' : '$' : c : go cs
     | otherwise = '$' : go (c : cs)
   go ('-':cs) =
     case cs of
       '-':_ -> '\\' : '-' : go cs
       _ -> '-' : go cs
   go ('.':cs) =
     case cs of
       '.':'.':_ -> '\\' : '.' : go cs
       _ -> '.' : go cs
   go (c:':':cs)
     | c /= ']'
     , case cs of
         [] -> True
         (' ':_) -> True
         _ -> False
     = (if escapable c then ('\\' :) else id) $ c : ':' : go cs
   go (c:cs)
     | escapable c = '\\' : c : go cs
     | otherwise = c : go cs

-- | Wrapper selecting block-level attribute rendering (own line).
newtype BlockAttr = BlockAttr Attr

-- | Render one attribute key/value in djot syntax (@#id@, @.class@, k=v).
formatAttrPart :: (ByteString, ByteString) -> Layout.Doc Text
formatAttrPart ("id",ident) = literal ("#" <> fromUtf8 ident)
formatAttrPart ("class", classes') =
  hsep $ map (("." <>) . literal) $ T.words $ fromUtf8 classes'
formatAttrPart (k,v) =
  literal (fromUtf8 k) <> "=" <>
    doubleQuotes (literal (escapeDjot Normal v))

{-# SPECIALIZE toLayout :: Blocks -> State BState (Layout.Doc Text) #-}
{-# SPECIALIZE toLayout :: Inlines -> State BState (Layout.Doc Text) #-}
{-# SPECIALIZE toLayout :: Attr -> State BState (Layout.Doc Text) #-}
-- | Things renderable to a layout document in the render-state monad.
class ToLayout a where
  toLayout :: a -> State BState (Layout.Doc Text)

instance ToLayout Inlines where
  toLayout = fmap F.fold . mapM toLayout . unMany

instance ToLayout Blocks where
  toLayout = fmap F.fold . mapM toLayout . unMany

instance ToLayout Attr where
  toLayout (Attr kvs) =
    pure $ if isEmpty contents
              then mempty
              else "{" <> contents <> "}"
   where
     contents = hsep (map formatAttrPart kvs)

instance ToLayout BlockAttr where
  toLayout (BlockAttr (Attr kvs)) =
    pure $ if isEmpty contents
              then mempty
              else hang 1 "{" (contents <> "}")
   where
     contents = hsep (map formatAttrPart kvs)

instance ToLayout (Node Block) where
  toLayout (Node _pos attr bl) =
    ($$) <$> (case bl of
                -- don't print an id that was generated implicitly
                Heading{} -> do
                  autoids <- gets autoIds
                  let Attr as = attr
                  toLayout $ BlockAttr $ Attr
                    [(k,v) | (k,v) <- as
                           , not (k == "id" && v `Set.member` autoids)]
                Section{} -> do
                  autoids <- gets autoIds
                  let Attr as = attr
                  toLayout $ BlockAttr $ Attr
                    [(k,v) | (k,v) <- as
                           , not (k == "id" && v `Set.member` autoids)]
                _ -> toLayout (BlockAttr attr))
         <*> (($$ blankline) <$>
              case bl of
                Para ils -> toLayout ils
                Heading lev ils -> do
                  contents <- toLayout ils
                  pure $ literal (T.replicate lev "#") <+> contents
                Section bls -> ($$ blankline) <$> toLayout bls
                ThematicBreak -> pure $ literal "* * * *"
                BulletList listSpacing items -> do
                  -- alternate bullet char so adjacent lists stay distinct
                  lastb <- gets lastBullet
                  let bullet = case lastb of
                                 Just '+' -> "-"
                                 Just '-' -> "+"
                                 _ -> "-"
                  (case listSpacing of
                     Tight -> vcat . map chomp
                     Loose -> vsep) <$>
                    mapM (fmap (hang 2 (bullet <> space)) . toLayout) items
                OrderedList listAttr listSpacing items ->
                  (case listSpacing of
                     Tight -> vcat . map chomp
                     Loose -> vsep) <$>
                    zipWithM (toOrderedListItem listAttr)
                      [(orderedListStart listAttr)..] items
                DefinitionList listSpacing items ->
                  (case listSpacing of
                     Tight -> vcat . map chomp
                     Loose -> vsep) <$>
                    mapM toDefinitionListItem items
                TaskList listSpacing items ->
                  (case listSpacing of
                     Tight -> vcat . map chomp
                     Loose -> vsep) <$>
                    mapM toTaskListItem items
                Div bls -> do
                  -- outer fence must be longer than any nested div fence
                  let nestedDivs = computeDivNestingLevel bls
                  contents <- toLayout bls
                  let colons = literal (T.replicate (nestedDivs + 3) ":")
                  pure $ colons $$ contents $$ colons
                BlockQuote bls ->
                  if bls == mempty
                     then pure ">"
                     else prefixed "> " <$> toLayout bls
                CodeBlock lang bs -> do
                  -- fence must exceed the longest backtick run in the body
                  let longesttickline =
                        case B8.lines bs of
                          [] -> 0
                          ls -> maximum $
                                  map (B8.length . B8.takeWhile (=='`')) ls
                  let numticks = max 3 longesttickline
                  let ticks = literal $ T.replicate numticks "`"
                  let lang' = if lang == mempty
                                 then mempty
                                 else literal (fromUtf8 lang)
                  pure $ ticks <+> lang'
                      $$ literal (fromUtf8 bs)
                      $$ ticks
                Table mbCaption rows -> do
                  caption <- case mbCaption of
                               Nothing -> pure mempty
                               Just (Caption bls) ->
                                 hang 2 ("^" <> space) <$> toLayout bls
                  body <- toTable rows
                  pure $ body $+$ caption
                RawBlock (Format "djot") bs ->
                  pure $ literal (fromUtf8 bs)
                RawBlock _ _ -> pure mempty)
          <* modify' (\st -> st{ afterSpace = True
                               -- Handle case of one bullet list right after
                               -- another; we need to change the bullet to
                               -- start a new list:
                               , lastBullet =
                                   case bl of
                                     BulletList{} ->
                                       case lastBullet st of
                                         Just '-' -> Just '+'
                                         Just '+' -> Just '-'
                                         _ -> Just '-'
                                     _ -> Nothing
                               })

-- | Render table rows, sizing columns to the widest cell and emitting
-- separator lines after header rows (and before the body when a
-- non-default alignment has no header row).
toTable :: [[Cell]] -> State BState (Layout.Doc Text)
toTable [] = pure "|--|" -- minimal empty table
toTable rows = do
  let getCellContents (Cell hd al ils) = ((hd, al),) <$> toLayout ils
  rowContents <- mapM (mapM getCellContents) rows
  let colwidths = map (maximum . map (offset . snd)) (transpose rowContents)
  let toCell width ((_,align), d) =
        (case align of
           AlignLeft -> lblock
           AlignRight -> rblock
           AlignCenter -> cblock
           AlignDefault -> lblock) width d
  let mkRow ds = hcat $ vfill "| " : intersperse (vfill " | ") ds
                          ++ [vfill " |"]
  let mkLines ds = hcat $ vfill "|" : intersperse (vfill "|") ds
                            ++ [vfill "|"]
  let toUnderline width ((_,al),_) =
        literal $
          case al of
            AlignLeft -> ":" <> T.replicate (width + 1) "-"
            AlignRight -> T.replicate (width + 1) "-" <> ":"
            AlignCenter -> ":" <> T.replicate width "-" <> ":"
            AlignDefault -> T.replicate width "-"
  let initialSep =
        case rowContents of
          cells@(((BodyCell,al),_):_):_ | al /= AlignDefault
            -> mkLines (zipWith toUnderline colwidths cells)
          _ -> mempty
  let toRow cells =
        let isHeader = case cells of
                         ((HeadCell,_),_) : _ -> True
                         _ -> False
        in  mkRow (zipWith toCell colwidths cells) $$
            if isHeader
               then mkLines (zipWith toUnderline colwidths cells)
               else mempty
  pure $ initialSep $$ vcat (map toRow rowContents)

-- | Render a definition list item: term and definition hung under @:@.
toDefinitionListItem :: (Inlines, Blocks) -> State BState (Layout.Doc Text)
toDefinitionListItem (term, def) = do
  term' <- toLayout term
  def' <- toLayout def
  pure $ hang 2 (":" <> space) $ term' $+$ def'

-- | Render a task list item with @- [ ]@ / @- [X]@ marker.
toTaskListItem :: (TaskStatus, Blocks) -> State BState (Layout.Doc Text)
toTaskListItem (status, bls) = do
  contents <- toLayout bls
  let marker = case status of
                 Incomplete -> "- [ ]" <> space
                 Complete -> "- [X]" <> space
  pure $ hang 2 marker contents

-- | Render one ordered list item with its formatted marker.
toOrderedListItem :: OrderedListAttributes -> Int -> Blocks
                  -> State BState (Layout.Doc Text)
toOrderedListItem listAttr num bs = do
  contents <- toLayout bs
  let marker = formatOrderedListMarker listAttr num
  pure $ hang (offset marker + 1) (marker <> space) contents

-- | Format an ordered list marker (number style plus delimiters).
formatOrderedListMarker :: OrderedListAttributes -> Int -> Layout.Doc Text
formatOrderedListMarker listAttr =
  addDelims (orderedListDelim listAttr) .
    formatNumber (orderedListStyle listAttr)

-- | Attach the list delimiter(s) around a formatted number.
addDelims :: OrderedListDelim -> Layout.Doc Text -> Layout.Doc Text
addDelims RightPeriod d = d <> "."
addDelims RightParen d = d <> ")"
addDelims LeftRightParen d = "(" <> d <> ")"

-- | Format a list number in the requested style.
-- NOTE(review): letter styles only cover 1..26 -- confirm callers
-- never exceed 'z'.
formatNumber :: OrderedListStyle -> Int -> Layout.Doc Text
formatNumber Decimal n = literal (T.pack (show n))
formatNumber LetterUpper n = literal (T.singleton (chr (ord 'A' + n - 1)))
formatNumber LetterLower n = literal (T.singleton (chr (ord 'a' + n - 1)))
formatNumber RomanUpper n = literal $ toRomanNumeral n
formatNumber RomanLower n = literal $ T.toLower (toRomanNumeral n)

-- | Convert number < 4000 to uppercase roman numeral. (from pandoc)
toRomanNumeral :: Int -> T.Text
toRomanNumeral x
  | x >= 4000 || x < 0 = "?"
  | x >= 1000 = "M" <> toRomanNumeral (x - 1000)
  | x >= 900 = "CM" <> toRomanNumeral (x - 900)
  | x >= 500 = "D" <> toRomanNumeral (x - 500)
  | x >= 400 = "CD" <> toRomanNumeral (x - 400)
  | x >= 100 = "C" <> toRomanNumeral (x - 100)
  | x >= 90 = "XC" <> toRomanNumeral (x - 90)
  | x >= 50 = "L" <> toRomanNumeral (x - 50)
  | x >= 40 = "XL" <> toRomanNumeral (x - 40)
  | x >= 10 = "X" <> toRomanNumeral (x - 10)
  | x == 9 = "IX"
  | x >= 5 = "V" <> toRomanNumeral (x - 5)
  | x == 4 = "IV"
  | x >= 1 = "I" <> toRomanNumeral (x - 1)
  | otherwise = ""

instance ToLayout (Node Inline) where
  toLayout (Node _pos attr il) = (<>)
    <$> (case il of
          Str bs -> do
            -- map smart punctuation back to its djot source form
            let fixSmart = T.replace "\x2014" "---" .
                           T.replace "\x2013" "--" .
                           T.replace "\x2026" "..." .
                           T.replace "\x2019" "'" .
                           T.replace "\x201C" "\""
            let chunks =
                  T.groupBy
                   (\c d -> (c /= ' ' && d /= ' ') || (c == ' ' && d == ' '))
                   (fixSmart $ escapeDjot Normal bs)
            let toChunk ch = case T.uncons ch of
                               Just (' ', rest) ->
                                 afterBreak "{}" <> space <> literal rest
                               _ -> literal ch
            pure $ hcat $ map toChunk chunks
          SoftBreak -> do
            opts <- gets options
            pure $ if preserveSoftBreaks opts
                      then cr
                      else space
          HardBreak -> pure (literal "\\" <> cr)
          NonBreakingSpace -> pure "\\ "
          Emph ils -> surround '_' ils
          Strong ils -> surround '*' ils
          Highlight ils -> surround '=' ils
          Insert ils -> surround '+' ils
          Delete ils -> surround '-' ils
          Superscript ils -> surround '^' ils
          Subscript ils -> surround '~' ils
          Quoted SingleQuotes ils -> surround '\'' ils
          Quoted DoubleQuotes ils -> surround '"' ils
          Verbatim bs -> pure $ toVerbatimSpan bs
          Math mt bs -> do
            let suffix = toVerbatimSpan bs
            let prefix = case mt of
                           DisplayMath -> "$$"
                           InlineMath -> "$"
            pure $ prefix <> suffix
          Symbol bs -> pure $ ":" <> literal (fromUtf8 bs) <> ":"
          Span ils -> do
            contents <- toLayout ils
            pure $ "[" <> contents <> "]" <>
                   case attr of
                     -- there must be attributes for it to be a span
                     Attr [] -> "{}"
                     _ -> mempty
          Link ils target -> do
            contents <- toLayout ils
            let suffix = toLinkSuffix target contents
            pure $ "[" <> contents <> "]" <> suffix
          Image ils target -> do
            contents <- toLayout ils
            let suffix = toLinkSuffix target contents
            pure $ "![" <> contents <> "]" <> suffix
          EmailLink email -> pure $ "<" <> literal (fromUtf8 email) <> ">"
          UrlLink url -> pure $ "<" <> literal (fromUtf8 url) <> ">"
          RawInline (Format "djot") bs -> pure $ literal (fromUtf8 bs)
          RawInline _ _ -> pure mempty
          FootnoteReference label -> do
            -- record first-use order of notes for output ordering
            order <- gets noteOrder
            case M.lookup label order of
              Nothing -> modify' $ \st ->
                           st{ noteOrder =
                                 M.insert label (M.size order + 1) order }
              Just _ -> pure ()
            pure $ toNoteRef label)
    <*> toLayout attr
    <* modify' (\st -> st{ afterSpace =
                             case il of
                               Str bs | isWhite (B8.takeEnd 1 bs) -> True
                               SoftBreak -> True
                               HardBreak -> True
                               NonBreakingSpace -> True
                               _ -> False })

-- | Render a link/image target: direct URL, @[]@ when the label equals
-- the rendered contents, or an explicit reference label.
toLinkSuffix :: Target -> Layout.Doc Text -> Layout.Doc Text
toLinkSuffix (Direct url) _ = literal $ "(" <> fromUtf8 url <> ")"
toLinkSuffix (Reference label) d
  | render Nothing d == fromUtf8 label = literal "[]"
  | otherwise = literal $ "[" <> fromUtf8 label <> "]"

-- | Render a verbatim span, using one more backtick than the longest
-- run inside and padding with spaces if the content touches the fence.
toVerbatimSpan :: ByteString -> Layout.Doc Text
toVerbatimSpan bs =
  ticks <> (if startsWithTick then " " else mempty)
        <> literal (fromUtf8 bs)
        <> (if endsWithTick then " " else mempty)
        <> ticks
 where
   startsWithTick = B8.take 1 bs == "`"
   endsWithTick = B8.takeEnd 1 bs == "`"
   ticks = literal $ T.replicate (maxticks + 1) "`"
   maxticks = fst $ B8.foldl' scanTicks (0,0) bs
   scanTicks (longest, theseticks) '`' =
     (max (theseticks + 1) longest, theseticks + 1)
   scanTicks (longest, _) _ = (longest, 0)

-- | True for a single space or tab byte string.
isWhite :: ByteString -> Bool
isWhite " " = True
isWhite "\t" = True
isWhite _ = False

-- | Wrap inlines in a delimiter character, adding @{...}@ when the
-- bare delimiter would be ambiguous (nested use, adjacent spaces, or
-- empty contents).
surround :: Char -> Inlines -> State BState (Layout.Doc Text)
surround c ils = do
  let startBeforeSpace =
        case Seq.viewl (unMany ils) of
          Node _pos _ (Str bs) Seq.:< _ -> isWhite (B8.take 1 bs)
          _ -> False
  modify' $ \st ->
    st{ nestings = IntMap.adjust (+ 1) (ord c) (nestings st)}
  contents <- toLayout ils
  modify' $ \st ->
    st{ nestings = IntMap.adjust (\x -> x - 1) (ord c) (nestings st)}
  endAfterSpace <- gets afterSpace
  nestingLevel <- gets (fromMaybe 1 . IntMap.lookup (ord c) . nestings)
  let core = char c <> contents <> char c
  pure $ if nestingLevel == 0 && not (startBeforeSpace || endAfterSpace)
              && not (null ils)
            then core
            else char '{' <> core <> char '}'

-- | Render a footnote reference marker.
toNoteRef :: ByteString -> Layout.Doc Text
toNoteRef bs = literal ("[^" <> fromUtf8 bs <> "]")

-- | Maximum depth of nested Divs (used to size the colon fences).
computeDivNestingLevel :: Blocks -> Int
computeDivNestingLevel =
  foldr go 0 . unMany
 where
   go (Node _pos _ (Div bls')) n =
     max (n + 1) (foldr go (n + 1) (unMany bls'))
   go _ n = n
djot-0.1.2.4/src/Djot/Html.hs0000644000000000000000000003422007346545000013735 0ustar0000000000000000{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE OverloadedLists #-}
{-# LANGUAGE Strict #-}
-- | Renderer from the djot AST to HTML.
module Djot.Html
  ( inlinesToByteString
  , renderHtml
  , RenderOptions(..)
  )
where

import Djot.AST
import Data.Tuple (swap)
import Djot.Parse (strToUtf8)
import Djot.Options (RenderOptions(..))
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as B8
import Data.ByteString.Builder (Builder, byteString, word8, intDec)
import qualified Data.Sequence as Seq
import qualified Data.Map.Strict as M
import Data.Maybe (fromMaybe)
import Data.List (sort)
import Control.Monad.State
import qualified Data.Foldable as F

-- | Render a Doc as an HTML Builder: body followed by an endnotes
-- section (if any notes were referenced).
renderHtml :: RenderOptions -> Doc -> Builder
renderHtml opts doc = evalState
  ( (<>) <$> toBuilder (docBlocks doc)
         <*> toNotes )
  BState{ noteMap = docFootnotes doc
        , noteRefs = mempty
        , renderedNotes = mempty
        , referenceMap = docReferences doc <> docAutoReferences doc
        , options = opts }

-- | Emit the endnotes section: an ordered list of the rendered notes,
-- numbered in order of first reference.
toNotes :: State BState Builder
toNotes = do
  st <- get
  let noterefs = noteRefs st
  let numnotes = M.size noterefs
  let revnoterefs = sort $ map swap $ M.toList noterefs
  let toNote (num, lab) =
        let num' = B8.pack (show num)
        in inTags "li" NoPos (Attr [("id", "fn" <> num')])
             ("\n" <> fromMaybe mempty (M.lookup lab (renderedNotes st)))
             <> "\n"
  if numnotes < 1
     then pure mempty
     else pure $
       inTags "section" NoPos (Attr [("role", "doc-endnotes")])
         ("\n" <> singleTag "hr" NoPos mempty <> "\n" <>
           inTags "ol" NoPos mempty ("\n" <> foldMap toNote revnoterefs)
             <> "\n")
         <> "\n"

-- | Append a backlink to the last paragraph of a note's blocks
-- (or add a new paragraph if the note doesn't end with one).
addBackref :: ByteString -> Blocks -> Blocks
addBackref num (Many bls) =
  Many $
    case Seq.viewr bls of
      rest Seq.:> Node pos attr (Para ils) ->
        rest Seq.|> Node pos attr (Para (ils <> backlink))
      _ -> bls Seq.|> Node NoPos mempty (Para
backlink) where backlink = Many $ Seq.singleton $ Node NoPos (Attr [("role", "doc-backlink")]) (Link (str (strToUtf8 "\8617\65038")) (Direct ("#fnref" <> num))) {-# INLINE escapeHtml #-} escapeHtml :: ByteString -> Builder escapeHtml bs = if hasEscapable bs then B.foldl' go mempty bs else byteString bs where hasEscapable = B.any (\w -> w == 38 || w == 60 || w == 62) go b 38 = b <> byteString "&" go b 60 = b <> byteString "<" go b 62 = b <> byteString ">" go b c = b <> word8 c {-# INLINE escapeHtmlAttribute #-} escapeHtmlAttribute :: ByteString -> Builder escapeHtmlAttribute bs = if hasEscapable bs then B.foldl' go mempty bs else byteString bs where hasEscapable = B.any (\w -> w == 38 || w == 60 || w == 62 || w == 34) go b 38 = b <> byteString "&" go b 60 = b <> byteString "<" go b 62 = b <> byteString ">" go b 34 = b <> byteString """ go b c = b <> word8 c data BState = BState { noteMap :: NoteMap , noteRefs :: M.Map ByteString Int , renderedNotes :: M.Map ByteString Builder , referenceMap :: ReferenceMap , options :: RenderOptions } {-# SPECIALIZE toBuilder :: Blocks -> State BState Builder #-} {-# SPECIALIZE toBuilder :: Inlines -> State BState Builder #-} class ToBuilder a where toBuilder :: a -> State BState Builder instance ToBuilder Inlines where toBuilder = fmap F.fold . mapM toBuilder . unMany instance ToBuilder Blocks where toBuilder = fmap F.fold . mapM toBuilder . unMany instance ToBuilder (Node Block) where toBuilder (Node pos attr bl) = let addNl = (<> "\n") in case bl of Para ils -> addNl . inTags "p" pos attr <$> toBuilder ils Heading lev ils -> let tag = case lev of 1 -> "h1" 2 -> "h2" 3 -> "h3" 4 -> "h4" 5 -> "h5" 6 -> "h6" _ -> "p" in addNl . inTags tag pos attr <$> toBuilder ils Section bls -> do contents <- toBuilder bls pure $ addNl $ inTags "section" pos attr $ "\n" <> contents ThematicBreak -> pure $ addNl $ singleTag "hr" pos attr BulletList listSpacing items -> addNl . inTags "ul" pos attr . ("\n" <>) . 
mconcat <$> mapM toLi items where toLi bls = addNl . inTags "li" NoPos mempty . ("\n" <>) <$> toItemContents listSpacing bls OrderedList listAttr listSpacing items -> addNl . inTags "ol" pos (Attr [("start", strToUtf8 (show start)) | start /= 1] <> Attr [("type", typ) | typ /= "1"] <> attr) . ("\n" <>) . mconcat <$> mapM toLi items where typ = case orderedListStyle listAttr of Decimal -> "1" LetterUpper -> "A" LetterLower -> "a" RomanUpper -> "I" RomanLower -> "i" start = orderedListStart listAttr toLi bls = addNl . inTags "li" NoPos mempty . ("\n" <>) <$> toItemContents listSpacing bls DefinitionList listSpacing defs -> addNl . inTags "dl" pos attr . ("\n" <>) . mconcat <$> mapM (toDefinition listSpacing) defs TaskList listSpacing items -> addNl . inTags "ul" pos (Attr [("class", "task-list")] <> attr) . ("\n" <>) . mconcat <$> mapM (toTaskListItem listSpacing) items Div bls -> addNl . inTags "div" pos attr . ("\n" <>) <$> toBuilder bls BlockQuote bls -> addNl . inTags "blockquote" pos attr . ("\n" <>) <$> toBuilder bls CodeBlock lang bs -> pure $ inTags "pre" pos attr (inTags "code" NoPos codeattr (escapeHtml bs)) <> "\n" where codeattr = if B.null lang then mempty else Attr [("class", "language-" <> lang)] Table mbCaption rows -> do rows' <- mapM toRow rows capt <- case mbCaption of Nothing -> pure mempty Just (Caption bs) -> addNl . inTags "caption" NoPos mempty <$> case F.toList (unMany bs) of [Node _pos at (Para ils)] | at == mempty -> toBuilder ils _ -> ("\n" <>) <$> toBuilder bs pure $ addNl . inTags "table" pos attr . ("\n" <>) $ capt <> mconcat rows' RawBlock (Format "html") bs -> pure $ byteString bs RawBlock _ _ -> pure mempty toRow :: [Cell] -> State BState Builder toRow cells = (<> "\n") . inTags "tr" NoPos mempty . ("\n" <>) . mconcat <$> mapM toCell cells toCell :: Cell -> State BState Builder toCell (Cell cellType align ils) = (<> "\n") . 
inTags (if cellType == HeadCell then "th" else "td") NoPos attr <$> toBuilder ils where attr = Attr $ case align of AlignDefault -> [] AlignLeft -> [("style", "text-align: left;")] AlignRight -> [("style", "text-align: right;")] AlignCenter -> [("style", "text-align: center;")] toItemContents :: ListSpacing -> Blocks -> State BState Builder toItemContents listSpacing = fmap F.fold . mapM go . unMany where go (Node pos attr bl) = case bl of Para ils | listSpacing == Tight -> if attr == mempty then (<> "\n") <$> toBuilder ils else (<> "\n") . inTags "span" pos attr <$> toBuilder ils | otherwise -> toBuilder (Node pos attr bl) _ -> toBuilder (Node pos attr bl) toTaskListItem :: ListSpacing -> (TaskStatus, Blocks) -> State BState Builder toTaskListItem listSpacing (status, bs) = do body <- case Seq.viewl $ unMany bs of Node pos attr (Para ils) Seq.:< rest -> toItemContents listSpacing (Many (Node pos attr (Para (rawInline (Format "html") ("")) Seq.<| rest)) _ -> toBuilder $ rawBlock (Format "html") input <> bs pure $ inTags "li" NoPos (Attr [("class", if status == Complete then "checked" else "unchecked")]) ("\n" <> body) <> "\n" where inputattr = " type=\"checkbox\"" <> if status == Complete then " checked=\"\"" else "" input = " inputattr <> " />" toDefinition :: ListSpacing -> (Inlines, Blocks) -> State BState Builder toDefinition listSpacing (term, defn) = (<>) <$> ((<> "\n") . inTags "dt" NoPos mempty <$> toBuilder term) <*> ((<> "\n") . inTags "dd" NoPos mempty . 
("\n" <>) <$> toItemContents listSpacing defn) instance ToBuilder (Node Inline) where toBuilder (Node pos attr il) = case il of Str bs -> case attr of Attr [] | pos == NoPos -> pure $ escapeHtml bs _ -> pure $ inTags "span" pos attr $ escapeHtml bs SoftBreak -> do opts <- gets options pure $ word8 $ if preserveSoftBreaks opts then 10 else 32 HardBreak -> pure $ singleTag "br" NoPos attr <> "\n" NonBreakingSpace -> pure $ byteString " " Emph ils -> inTags "em" pos attr <$> toBuilder ils Strong ils -> inTags "strong" pos attr <$> toBuilder ils Highlight ils -> inTags "mark" pos attr <$> toBuilder ils Insert ils -> inTags "ins" pos attr <$> toBuilder ils Delete ils -> inTags "del" pos attr <$> toBuilder ils Superscript ils -> inTags "sup" pos attr <$> toBuilder ils Subscript ils -> inTags "sub" pos attr <$> toBuilder ils Quoted SingleQuotes ils -> inSingleQuotes <$> toBuilder ils Quoted DoubleQuotes ils -> inDoubleQuotes <$> toBuilder ils Verbatim bs -> pure $ inTags "code" pos attr (escapeHtml bs) Math DisplayMath bs -> pure $ inTags "span" pos (Attr [("class", "math display")] <> attr) ("\\[" <> escapeHtml bs <> "\\]") Math InlineMath bs -> pure $ inTags "span" pos (Attr [("class", "math inline")] <> attr) ("\\(" <> escapeHtml bs <> "\\)") Symbol bs -> pure $ inTags "span" pos (Attr [("class", "symbol")] <> attr) (":" <> escapeHtml bs <> ":") Span ils -> inTags "span" pos attr <$> toBuilder ils Link ils target -> do attr' <- case target of Direct u -> pure $ Attr [("href", u)] Reference label -> do rm <- gets referenceMap case lookupReference label rm of Nothing -> pure $ Attr [("href", "")] Just (u, Attr as) -> pure $ Attr (("href",u):as) inTags "a" pos (attr' <> attr) <$> toBuilder ils Image ils target -> do attr' <- case target of Direct u -> pure $ Attr [("src", u)] Reference label -> do rm <- gets referenceMap case lookupReference label rm of Nothing -> pure $ Attr [("src", "")] Just (u, Attr as) -> pure $ Attr (("src",u):as) pure $ singleTag "img" pos (Attr 
[("alt", inlinesToByteString ils)] <> attr' <> attr) EmailLink email -> toBuilder (Node pos attr (Link (str email) (Direct ("mailto:" <> email)))) UrlLink url -> toBuilder (Node pos attr (Link (str url) (Direct url))) RawInline (Format "html") bs -> pure $ byteString bs RawInline _ _ -> pure mempty FootnoteReference label -> do noterefs <- gets noteRefs notemap <- gets noteMap num <- case M.lookup label noterefs of Just num -> pure num Nothing -> do let num = M.size noterefs + 1 modify $ \st -> st{ noteRefs = M.insert label num noterefs } renderedNotesMap <- gets renderedNotes case M.lookup label renderedNotesMap of Just _ -> pure () Nothing -> do -- render the note and add to renderedNotes let num' = B8.pack (show num) rendered <- maybe (toBuilder $ addBackref num' (mempty :: Blocks)) (toBuilder . addBackref num') (lookupNote label notemap) modify $ \st -> st{ renderedNotes = M.insert label rendered (renderedNotes st) } pure num let num' = B8.pack $ show num pure $ inTags "a" pos (Attr [("id", "fnref" <> num'), ("href", "#fn" <> num'), ("role", "doc-noteref")] <> attr) $ inTags "sup" pos mempty (escapeHtml num') {-# INLINE inTags #-} inTags :: ByteString -> Pos -> Attr -> Builder -> Builder inTags tag pos attr contents = "<" <> byteString tag <> posToBuilder pos <> attrToBuilder attr <> ">" <> contents <> " byteString tag <> ">" {-# INLINE singleTag #-} singleTag :: ByteString -> Pos -> Attr -> Builder singleTag tag pos attr = "<" <> byteString tag <> posToBuilder pos <> attrToBuilder attr <> ">" {-# INLINE attrToBuilder #-} attrToBuilder :: Attr -> Builder attrToBuilder (Attr pairs) = foldMap go pairs where go (k,v) = " " <> byteString k <> "=\"" <> escapeHtmlAttribute v <> "\"" {-# INLINE posToBuilder #-} posToBuilder :: Pos -> Builder posToBuilder NoPos = mempty posToBuilder (Pos sl sc el ec) = " data-pos=\"" <> intDec sl <> ":" <> intDec sc <> "-" <> intDec el <> ":" <> intDec ec <> "\"" inSingleQuotes :: Builder -> Builder inSingleQuotes x = byteString 
(strToUtf8 "\x2018") <> x <> byteString (strToUtf8 "\x2019") inDoubleQuotes :: Builder -> Builder inDoubleQuotes x = byteString (strToUtf8 "\x201C") <> x <> byteString (strToUtf8 "\x201D") djot-0.1.2.4/src/Djot/Inlines.hs0000644000000000000000000004227607346545000014444 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedLists #-} {-# LANGUAGE Strict #-} {-# LANGUAGE BinaryLiterals #-} module Djot.Inlines ( parseInlines , parseTableCells ) where import Data.Char (isAscii, isAlphaNum, isSymbol, isPunctuation) import Control.Monad (guard, when, mzero) import Data.Sequence (Seq) import qualified Data.Sequence as Seq import Data.Set (Set) import qualified Data.Set as Set import Djot.Parse import Djot.Options (ParseOptions(..), SourcePosOption(..)) import Djot.Attributes (pAttributes) import Djot.AST import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as B8 import Data.ByteString (ByteString) import Data.Foldable as F import Control.Applicative import Data.Maybe (fromMaybe) -- import Debug.Trace {-# INLINE isSpecial #-} isSpecial :: Char -> Bool isSpecial c = c == '[' || c == ']' || c == '<' || c == '>' || c == '$' || c == '!' || c == '{' || c == '}' || c == ':' || c == '=' || c == '+' || c == '-' || c == '^' || c == '~' || c == '*' || c == '_' || c == '\''|| c == '"' || c == '.' 
|| c == '|' || c == '`' || c == '\\'|| c == '\n' || c == '\r' parseInlines :: ParseOptions -> Seq Chunk -> Either String Inlines parseInlines opts chunks = do case parse (pInlines <* eof) ParserState{ mode = NormalMode , activeDelims = mempty , options = opts } (toList (stripEndChunks chunks)) of Just ils -> Right ils Nothing -> Left $ "parseInlines failed on input: " <> show (foldMap chunkBytes chunks) parseTableCells :: ParseOptions -> Chunk -> Either String [Inlines] parseTableCells opts chunk = do case parse (asciiChar '|' *> some (removeFinalWs <$> pInlines <* asciiChar '|') <* skipMany ws <* eof) ParserState{ mode = TableCellMode , activeDelims = mempty , options = opts } [chunk] of Just cells -> Right cells Nothing -> Left $ "parseTableCells failed on input: " <> show chunk removeFinalWs :: Inlines -> Inlines removeFinalWs (Many ils) = Many $ case Seq.viewr ils of rest Seq.:> Node pos attr (Str bs) | B8.takeEnd 1 bs == " " -> case B8.dropWhileEnd (== ' ') bs of "" -> rest bs' -> rest Seq.|> Node pos attr (Str bs') _ -> ils data InlineParseMode = NormalMode | TableCellMode deriving (Show, Ord, Eq) data ParserState = ParserState { mode :: InlineParseMode , activeDelims :: Set Delim , options :: ParseOptions } deriving (Show) data Delim = Delim Bool Char deriving (Show, Ord, Eq) type P = Parser ParserState pInlines :: P Inlines pInlines = skipMany ws *> (mconcat <$> many pInline) pInline :: P Inlines pInline = do sline <- sourceLine scol <- sourceColumn res <- pInline' opts <- options <$> getState (case sourcePositions opts of AllSourcePos -> do eline <- sourceLine ecol <- sourceColumn pure $ addPos (Pos sline scol eline (ecol - 1)) <$> res _ -> pure res) >>= pOptionalAttributes pOptionalAttributes :: Inlines -> P Inlines pOptionalAttributes (Many ils) = pAddAttributes (Many ils) <|> pure (Many ils) pAddAttributes :: Inlines -> P Inlines pAddAttributes (Many ils) = do attr <- mconcat <$> some pAttributes pure $ case attr of Attr [] -> Many ils _ -> case 
Seq.viewr ils of Seq.EmptyR -> mempty ils' Seq.:> Node pos attr' (Str bs) | B8.any isWs bs -> -- attach attribute to last word let (front, lastword) = B8.breakEnd isWs bs in if B.null lastword then Many ils -- ignore attr after whitespace else let (pos1, pos2) = case pos of NoPos -> (NoPos, NoPos) Pos sl sc el ec -> let frontlen = B8.length (B8.filter (\c -> c < '\128' || c >= '\192') front) in (Pos sl sc sl (sc + frontlen), Pos sl (sc + frontlen + 1) el ec) in Many (ils' Seq.|> Node pos1 attr' (Str front) Seq.|> Node pos2 attr (Str lastword)) ils' Seq.:> Node pos attr' il -> Many (ils' Seq.|> Node pos (attr' <> attr) il) pInline' :: P Inlines pInline' = do (do c <- lookahead (satisfyByte isSpecial) fails pCloser (case c of '\\' -> pEscaped '[' -> pFootnoteReference <|> pLinkOrSpan '<' -> pAutolink '!' -> pImage '_' -> pEmph '*' -> pStrong '^' -> pSuperscript '~' -> pSubscript '{' -> pEmph <|> pStrong <|> pHighlight <|> pInsert <|> pDelete <|> pSuperscript <|> pSubscript <|> pDoubleQuote <|> pSingleQuote <|> (mempty <$ pAttributes) '`' -> pVerbatim ':' -> pSymbol '$' -> pMath '"' -> pDoubleQuote '\'' -> pSingleQuote '-' -> pHyphens '.' -> pEllipses '\n' -> pSoftBreak _ -> mzero) <|> pSpecial ) <|> pWords pSpecial :: P Inlines pSpecial = do st <- getState c <- satisfyByte (case mode st of TableCellMode -> \d -> isSpecial d && d /= '|' _ -> isSpecial) if c == '\r' then pure mempty else pure $ str $ B8.singleton c pWords :: P Inlines pWords = str <$> byteStringOf (skipSome (skipSatisfyByte (not . 
isSpecial))) pEscaped :: P Inlines pEscaped = do asciiChar '\\' c <- satisfyByte (\d -> isAscii d && (isSymbol d || isPunctuation d || d == ' ' || d == '\t')) <|> ('\n' <$ endline) <|> pure '\\' case c of '\n' -> hardBreak <$ skipMany spaceOrTab _ | c == ' ' || c == '\t' -> pHardBreak <|> if c == ' ' then pure nonBreakingSpace else pure $ str "\\\t" _ -> pure $ str $ B8.singleton c pHardBreak :: P Inlines pHardBreak = do -- assumes we've parsed \ already skipMany spaceOrTab endline skipMany spaceOrTab pure hardBreak pSoftBreak :: P Inlines pSoftBreak = do endline skipMany spaceOrTab (mempty <$ eof) <|> pure softBreak pSymbol :: P Inlines pSymbol = do asciiChar ':' bs <- byteStringOf $ skipSome (skipSatisfyByte (\c -> c == '+' || c == '-' || c == '_' || (isAscii c && isAlphaNum c))) asciiChar ':' pure $ symbol bs pMath :: P Inlines pMath = do asciiChar '$' mathStyle <- (DisplayMath <$ asciiChar '$') <|> pure InlineMath verb <- pVerbatim case unMany verb of [Node pos attr (Verbatim bs)] -> pure $ Many $ Seq.singleton $ Node pos attr (Math mathStyle bs) _ -> pure $ (case mathStyle of DisplayMath -> str "$$" _ -> str "$") <> verb {-# INLINE bracesRequired #-} bracesRequired :: Char -> Bool bracesRequired '=' = True bracesRequired '+' = True bracesRequired '-' = True bracesRequired _ = False pCloser :: P () pCloser = do delims <- activeDelims <$> getState if Set.null delims then mzero else do openerHadBrace <- asum $ map (\(Delim hadBrace c) -> hadBrace <$ asciiChar c) (F.toList delims) mblastc <- peekBack let afterws = maybe True isWs mblastc when ( afterws || openerHadBrace ) $ asciiChar '}' pEmph, pStrong, pSuperscript, pSubscript :: P Inlines pEmph = pBetween '_' emph pStrong = pBetween '*' strong pSuperscript = pBetween '^' superscript pSubscript = pBetween '~' subscript pHighlight, pInsert, pDelete :: P Inlines pHighlight = pBetween '=' highlight pInsert = pBetween '+' insert pDelete = pBetween '-' delete pBetween :: Char -> (Inlines -> Inlines) -> P Inlines 
pBetween c constructor = do let starter leftBrace = do case leftBrace of False | bracesRequired c -> mzero | otherwise -> asciiChar c `notFollowedBy` (ws <|> asciiChar '}') True -> asciiChar c `notFollowedBy` asciiChar '}' let ender leftBrace = do mblastc <- peekBack let afterws = maybe True isWs mblastc asciiChar c if leftBrace then asciiChar '}' else guard (not afterws) `notFollowedBy` asciiChar '}' leftBrace <- (True <$ asciiChar '{') <|> pure False starterBs <- (if leftBrace then ("{" <>) else id) <$> byteStringOf (starter leftBrace) `notFollowedBy` pAttributes -- don't let *{.foo} start emphasis, for example oldActiveDelims <- activeDelims <$> getState updateState $ \st -> st{ activeDelims = Set.insert (Delim leftBrace c) (activeDelims st) } firstIl <- pInline <|> pBetween c constructor -- to allow stacked cases like '**hi**' restIls <- many pInline let ils = mconcat (firstIl:restIls) updateState $ \st -> st{ activeDelims = oldActiveDelims } (constructor ils <$ ender leftBrace) <|> pure (str starterBs <> ils) pTicks :: P Int pTicks = do sp <- getOffset skipSome (asciiChar '`') ep <- getOffset pure (ep - sp) pVerbatim :: P Inlines pVerbatim = do numticks <- pTicks let ender = pTicks >>= guard . (== numticks) let content = skipSome (skipSatisfyByte (\c -> c /= '`' && c /= '\\')) <|> (asciiChar '\\' <* anyChar) <|> (fails ender *> skipSome (asciiChar '`')) bs <- trimSpaces <$> byteStringOf (skipMany content) <* (ender <|> eof) (rawInline <$> pRawAttribute <*> pure bs) <|> pure (verbatim bs) -- Trim a leading space if first non-space character is `, -- and similarly for trailing space/last non-space. trimSpaces :: ByteString -> ByteString trimSpaces = trimSpaceFront . 
trimSpaceBack where trimSpaceFront bs = case B8.span (== ' ') bs of (a, b) | B8.take 1 b == "`" , not (B8.null a) -> B8.drop 1 bs _ -> bs trimSpaceBack bs = case B8.spanEnd (== ' ') bs of (a, b) | B8.takeEnd 1 a == "`" , not (B8.null b) -> B8.dropEnd 1 bs _ -> bs pRawAttribute :: P Format pRawAttribute = do byteString "{=" fmt <- Format <$> byteStringOf (skipMany (skipSatisfyByte (\c -> c /= '}' && not (isWs c)))) asciiChar '}' pure fmt pFootnoteReference :: P Inlines pFootnoteReference = do asciiChar '[' asciiChar '^' label <- byteStringOf $ skipMany $ skipSatisfyByte (\c -> c /= ']' && not (isWs c)) asciiChar ']' pure $ footnoteReference label -- returns Left with parsed content if no ] has been reached, otherwise Right -- with inner contents. pBracketed :: P (Either Inlines Inlines) pBracketed = do let starter = asciiChar '[' let ender = asciiChar ']' starterBs <- byteStringOf starter oldActiveDelims <- activeDelims <$> getState updateState $ \st -> st{ activeDelims = Set.insert (Delim False ']') (activeDelims st) } ils <- mconcat <$> many pInline updateState $ \st -> st{ activeDelims = oldActiveDelims } (Right ils <$ ender) <|> pure (Left (str starterBs <> ils)) pImage :: P Inlines pImage = do asciiChar '!' (res, raw) <- withByteString pBracketed case res of Left ils -> pure (str "!" <> ils) Right ils -> ((str "!" 
<>) <$> pAddAttributes (span_ ils)) <|> (image ils <$> (pDestination <|> pReference raw)) <|> pure (str "![" <> ils <> str "]") pAutolink :: P Inlines pAutolink = do asciiChar '<' res <- byteStringOf $ skipSome $ skipSatisfyByte (\c -> c /= '>' && c /= '<') asciiChar '>' let url = B8.filter (\c -> c /= '\n' && c /= '\r') res case B8.find (\c -> c == '@' || c == ':' || c == '.') url of Just '@' -> pure $ emailLink url Just _ -> pure $ urlLink url Nothing -> mzero pLinkOrSpan :: P Inlines pLinkOrSpan = do (res, raw) <- withByteString pBracketed case res of Left ils -> pure ils Right ils -> (span_ ils <$ lookahead (asciiChar '{')) <|> (link ils <$> (pDestination <|> pReference raw)) <|> pure (str "[" <> ils <> str "]") -- We allow balanced pairs of parens inside. pDestination :: P Target pDestination = do asciiChar '(' res <- byteStringOf $ pInBalancedParens 0 asciiChar ')' pure $ Direct (snd (handleEscapesAndNewlines res)) where handleEscapesAndNewlines = B8.foldl' go (False, mempty) go (esc, bs) '\n' = (esc, bs) go (esc, bs) '\r' = (esc, bs) go (True, bs) c = (False, bs `B8.snoc` c) go (False, bs) '\\' = (True, bs) go (False, bs) c = (False, bs `B8.snoc` c) pInBalancedParens :: Int -> P () pInBalancedParens nestlevel = (guard (nestlevel == 0) <* lookahead (asciiChar ')')) <|> do lev <- (nestlevel <$ (fails pCloser *> -- but see https://github.com/jgm/djot/discussions/247 skipSatisfyByte (\c -> c /= '(' && c /= ')' && c /= '\\'))) <|> (nestlevel <$ (asciiChar '\\' <* anyChar)) <|> ((nestlevel + 1) <$ asciiChar '(') <|> ((nestlevel - 1) <$ asciiChar ')') pInBalancedParens lev pReference :: ByteString -> P Target pReference rawDescription = do asciiChar '[' bs <- byteStringOf $ pAtMost 400 $ skipSatisfyByte (\c -> c /= '[' && c /= ']') asciiChar ']' let label = normalizeLabel $ if B.null bs then B.drop 1 $ B.dropEnd 1 $ B8.filter (/= '\n') rawDescription else bs pure $ Reference label pAtMost :: Int -> P () -> P () pAtMost n pa = optional_ (pa *> when (n > 0) (pAtMost 
( n - 1 ) pa)) pOpenDoubleQuote :: P () pOpenDoubleQuote = do lbrace <- (True <$ asciiChar '{') <|> pure False asciiChar '"' rbrace <- (True <$ asciiChar '}') <|> pure False guard $ lbrace || not rbrace pCloseDoubleQuote :: P () pCloseDoubleQuote = do mblastc <- peekBack let whitespaceBefore = maybe True isWs mblastc lbrace <- (True <$ asciiChar '{') <|> pure False asciiChar '"' rbrace <- (True <$ asciiChar '}') <|> pure False whitespaceAfter <- (True <$ lookahead (skipSatisfyByte isWs)) <|> pure False guard $ not lbrace && (rbrace || not whitespaceBefore || whitespaceAfter) pDoubleQuote :: P Inlines pDoubleQuote = (do pOpenDoubleQuote contents <- mconcat <$> many (fails pCloseDoubleQuote *> pInline) (doubleQuoted contents <$ pCloseDoubleQuote) <|> pure (openDoubleQuote <> contents)) <|> (closeDoubleQuote <$ asciiChar '"') openDoubleQuote, closeDoubleQuote :: Inlines openDoubleQuote = str "\226\128\156" -- utf8 0x201C closeDoubleQuote = str "\226\128\157" -- utf8 0x201D pOpenSingleQuote :: P () pOpenSingleQuote = do lastc <- fromMaybe '\n' <$> peekBack let openContext = lastc == '\t' || lastc == '\r' || lastc == '\n' || lastc == ' ' || lastc == '"' || lastc == '\'' || lastc == '(' || lastc == '[' || lastc == '\0' lbrace <- (True <$ asciiChar '{') <|> pure False asciiChar '\'' rbrace <- (True <$ asciiChar '}') <|> pure False guard $ lbrace || (openContext && not rbrace) pCloseSingleQuote :: P () pCloseSingleQuote = do mblastc <- peekBack let whitespaceBefore = maybe True isWs mblastc lbrace <- (True <$ asciiChar '{') <|> pure False asciiChar '\'' rbrace <- (True <$ asciiChar '}') <|> pure False alphaNumAfter <- (True <$ lookahead (satisfy isAlphaNum)) <|> pure False guard $ not lbrace && (rbrace || not (whitespaceBefore || alphaNumAfter)) pSingleQuote :: P Inlines pSingleQuote = (do pOpenSingleQuote contents <- mconcat <$> many (fails pCloseSingleQuote *> pInline) (singleQuoted contents <$ pCloseSingleQuote) <|> pure (closeSingleQuote <> contents)) <|> 
(closeSingleQuote <$ (pCloseSingleQuote <|> asciiChar '\'')) closeSingleQuote :: Inlines closeSingleQuote = str "\226\128\153" -- utf8 0x2019 pHyphens :: P Inlines pHyphens = do numHyphens <- length <$> some hyphen pure $ str $ go numHyphens where emdash = "\226\128\148" -- utf8 0x2014 endash = "\226\128\147" -- utf8 0x2013 hyphen = asciiChar '-' `notFollowedBy` asciiChar '}' go 1 = "-" go n | n `mod` 3 == 0 = mconcat (replicate (n `Prelude.div` 3) emdash) | n `mod` 2 == 0 = mconcat (replicate (n `Prelude.div` 2) endash) | n `mod` 3 == 2 = mconcat (replicate (n `Prelude.div` 3) emdash) <> endash | n `mod` 3 == 1 = mconcat (replicate (n `Prelude.div` 3 - 1) emdash) <> endash <> endash | otherwise = emdash <> go (n - 3) pEllipses :: P Inlines pEllipses = str "\226\128\166" {- utf8 0x2026 -} <$ byteString "..." stripEndChunks :: Seq Chunk -> Seq Chunk stripEndChunks cs = case Seq.viewr cs of initial Seq.:> c -> initial Seq.|> c{ chunkBytes = B8.dropWhileEnd isWs (chunkBytes c) } _ -> cs djot-0.1.2.4/src/Djot/Options.hs0000644000000000000000000000131407346545000014462 0ustar0000000000000000{-# LANGUAGE StrictData #-} module Djot.Options ( ParseOptions(..) , RenderOptions(..) , SourcePosOption(..) ) where newtype ParseOptions = ParseOptions { sourcePositions :: SourcePosOption -- ^ Add attributes for source lines } deriving (Show) newtype RenderOptions = RenderOptions { preserveSoftBreaks :: Bool -- ^ Preserve soft breaks as in the source } deriving (Show) -- | Adding source positions for blocks adds almost no overhead to parsing. -- Adding source positions for inlines has a small penalty. For many purposes -- it is enough to have source lines for blocks, so we offer the option. data SourcePosOption = NoSourcePos | BlockSourcePos | AllSourcePos deriving (Show, Eq, Ord) djot-0.1.2.4/src/Djot/Parse.hs0000644000000000000000000002710007346545000014102 0ustar0000000000000000{-# LANGUAGE BangPatterns #-} {-# LANGUAGE BinaryLiterals #-} module Djot.Parse ( Parser , Chunk(..) 
, parse , asciiChar , satisfyByte , skipSatisfyByte , satisfy , anyChar , skipMany , skipSome , eof , getState , updateState , lookahead , peek , peekBack , fails , failed , withByteString , byteStringOf , notFollowedBy , optional_ , byteString , getOffset , sourceLine , sourceColumn , branch , endline , restOfLine , ws , followedByWhitespace , followedByBlankLine , spaceOrTab , isWs , strToUtf8 , utf8ToStr ) where import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as B8 import Data.ByteString (ByteString) import Control.Applicative import Control.Monad (void, MonadPlus(..)) import Data.Char (chr) import Data.Bits import Data.Maybe (fromMaybe) import Data.Text.Encoding (decodeUtf8With, encodeUtf8) import Data.Text.Encoding.Error (lenientDecode) import qualified Data.Text as T -- import Text.Printf -- import Debug.Trace newtype Parser s a = Parser{ runParser :: ParserState s -> Maybe (ParserState s, a) } instance Functor (Parser s) where fmap f g = Parser $ \s -> case runParser g s of Nothing -> Nothing Just (s', !x) -> Just (s', f x) instance Applicative (Parser s) where pure x = Parser (\s -> Just (s, x)) liftA2 f g h = Parser $ \s -> case runParser g s of Nothing -> Nothing Just (s', x) -> case runParser h s' of Nothing -> Nothing Just (s'', y) -> Just (s'', f x y) instance Monad (Parser s) where return = pure f >>= g = Parser $ \s -> case runParser f s of Nothing -> Nothing Just (s', x) -> runParser (g x) s' instance Alternative (Parser s) where empty = Parser (const Nothing) f <|> g = Parser $ \s -> case runParser f s of Just (s', x) -> Just (s', x) Nothing -> runParser g s instance MonadPlus (Parser s) where mzero = empty mplus = (<|>) data Chunk = Chunk{ chunkLine :: Int , chunkColumn :: Int , chunkBytes :: ByteString } deriving (Show, Eq, Ord) data ParserState a = ParserState { chunks :: [Chunk] , subject :: !ByteString , offset :: !Int , line :: !Int , column :: !Int , userState :: !a } deriving (Show) -- | Apply a parser to a 
bytestring with a given user state. -- Returns @Nothing@ on failure, @Just result@ on success. parse :: Parser s a -> s -> [Chunk] -> Maybe a parse parser ustate chunks'' = snd <$> runParser parser ParserState { chunks = chunks' , subject = bs , offset = 0 , line = startline , column = startcol , userState = ustate } where (chunks', bs, startline, startcol) = case chunks'' of [] -> ([], mempty, 1, 0) (c:cs) -> (cs, chunkBytes c, chunkLine c, chunkColumn c) -- | Given a number of bytes, advances the offset and updates line/column. unsafeAdvance :: Int -> ParserState s -> ParserState s unsafeAdvance 0 = id unsafeAdvance !n = unsafeAdvance (n - 1) . unsafeAdvanceByte -- | Advance the offset and line/column for consuming a given byte. unsafeAdvanceByte :: ParserState s -> ParserState s unsafeAdvanceByte st | offset st + 1 >= B.length (subject st) , c:cs <- chunks st = st{ chunks = cs , subject = chunkBytes c , offset = 0 , line = chunkLine c , column = chunkColumn c } | otherwise = case B.index (subject st) (offset st) of 10 -> st{ offset = offset st + 1 , line = line st + 1 , column = 1 } 9 -> st{ offset = offset st + 1 , column = column st + (4 - (column st `mod` 4)) } !w | w < 0x80 -> st{ offset = offset st + 1 , column = column st + 1 } -- utf8 multibyte: only count byte 1: | w >= 0b11000000 -> st{ offset = offset st + 1 , column = column st + 1 } | otherwise -> st{ offset = offset st + 1 } -- | Returns current byte as Char. current :: ParserState s -> Maybe Char current st = subject st B8.!? offset st -- | Returns current byte as Char. peek :: Parser s (Maybe Char) peek = Parser $ \st -> Just (st, current st) -- | Returns previous byte as Char. Doesn't cross chunk boundaries. peekBack :: Parser s (Maybe Char) peekBack = Parser $ \st -> Just (st, subject st B8.!? (offset st - 1)) -- | Parse a byte satisfying a predicate. 
satisfyByte :: (Char -> Bool) -> Parser s Char satisfyByte f = Parser $ \st -> case current st of Just c | f c -> Just (unsafeAdvanceByte st, c) _ -> Nothing -- | Skip byte satisfying a predicate. skipSatisfyByte :: (Char -> Bool) -> Parser s () skipSatisfyByte f = Parser $ \st -> case current st of Just c | f c -> Just (unsafeAdvanceByte st, ()) _ -> Nothing -- | Parse a (possibly multibyte) Char satisfying a predicate. -- Assumes UTF-8 encoding. satisfy :: (Char -> Bool) -> Parser s Char satisfy f = Parser $ \st -> let peekWord !n = subject st B.!? (offset st + n) b2 = fromMaybe 0 $ peekWord 1 b3 = fromMaybe 0 $ peekWord 2 b4 = fromMaybe 0 $ peekWord 3 in case peekWord 0 of Nothing -> Nothing Just b1 | b1 < 0b10000000 , !c <- chr (fromIntegral b1) , f c -> Just (unsafeAdvanceByte st, c) | b1 .&. 0b11100000 == 0b11000000 , b2 >= 0b10000000 , !c <- chr (toCodePoint2 b1 b2) , f c -> Just (unsafeAdvance 2 st, c) | b1 .&. 0b11110000 == 0b11100000 , b2 >= 0b10000000 , b3 >= 0b10000000 , !c <- chr (toCodePoint3 b1 b2 b3) , f c -> Just (unsafeAdvance 3 st, c) | b1 .&. 0b11111000 == 0b11110000 , b2 >= 0b10000000 , b3 >= 0b10000000 , b4 >= 0b10000000 , !c <- chr (toCodePoint4 b1 b2 b3 b4) , f c -> Just (unsafeAdvance 4 st, c) | otherwise -> Nothing where toCodePoint2 a b = (fromIntegral (a .&. 0b00011111) `shiftL` 6) + fromIntegral (b .&. 0b00111111) toCodePoint3 a b c = (fromIntegral (a .&. 0b00001111) `shiftL` 12) + (fromIntegral (b .&. 0b00111111) `shiftL` 6) + fromIntegral (c .&. 0b00111111) toCodePoint4 a b c d = (fromIntegral (a .&. 0b00000111) `shiftL` 18) + (fromIntegral (b .&. 0b00111111) `shiftL` 12) + (fromIntegral (c .&. 0b00111111) `shiftL` 6) + fromIntegral (d .&. 0b00111111) -- | Parse any character. Assumes UTF-8 encoding. anyChar :: Parser s Char anyChar = satisfy (const True) -- | Parse an ASCII character. 
asciiChar :: Char -> Parser s () asciiChar !c = Parser $ \st -> case current st of Just d | d == c -> Just (unsafeAdvanceByte st, ()) _ -> Nothing -- | Apply parser 0 or more times, discarding result. skipMany :: Parser s a -> Parser s () skipMany parser = Parser go where go st = case runParser parser st of Nothing -> Just (st, ()) Just (st',_) -> go st' -- | Apply parser 1 or more times, discarding result. skipSome :: Parser s a -> Parser s () skipSome parser = parser *> skipMany parser -- | Succeeds if no more input. eof :: Parser s () eof = Parser $ \st -> case current st of Nothing -> Just (st, ()) Just _ -> Nothing -- | Returns current user state. getState :: Parser s s getState = Parser $ \st -> Just (st, userState st) -- | Updates user state. updateState :: (s -> s) -> Parser s () updateState f = Parser $ \st -> Just (st{ userState = f (userState st) }, ()) -- | Apply a parser, returning its result but not changing state -- or advancing. lookahead :: Parser s a -> Parser s a lookahead pa = Parser $ \st -> case runParser pa st of Just (_, x) -> Just (st, x) Nothing -> Nothing -- | Succeeds if parser fails. fails :: Parser s a -> Parser s () fails pa = Parser $ \st -> case runParser pa st of Just _ -> Nothing Nothing -> Just (st, ()) -- | Always fails. failed :: Parser s a failed = Parser $ const Nothing -- | Returns result of parse together with the bytestring -- consumed. withByteString :: Parser s a -> Parser s (a, ByteString) withByteString pa = Parser $ \st -> case runParser pa st of Just (st', x) -> Just (st', (x, B8.take (offset st' - offset st) (B8.drop (offset st) (subject st)))) Nothing -> Nothing -- | Returns bytestring consumed by parse. 
byteStringOf :: Parser s a -> Parser s ByteString byteStringOf pa = Parser $ \st -> case runParser pa st of Just (st', _) -> Just (st', case length (chunks st) - length (chunks st') of 0 -> B8.take (offset st' - offset st) (B8.drop (offset st) (subject st)) n -> B8.drop (offset st) (subject st) <> foldMap chunkBytes (take (n - 1) (chunks st)) <> B8.take (offset st') (subject st')) Nothing -> Nothing -- | Succeeds if first parser succeeds and second fails, returning -- first parser's value. notFollowedBy :: Parser s a -> Parser s b -> Parser s a notFollowedBy pa pb = pa <* fails pb -- | Apply parser but still succeed if it doesn't succeed. optional_ :: Parser s a -> Parser s () optional_ pa = void pa <|> pure () -- | Parse a bytestring. byteString :: ByteString -> Parser s () byteString bs = Parser $ \st -> if bs `B8.isPrefixOf` B8.drop (offset st) (subject st) then Just (unsafeAdvance (B.length bs) st, ()) else Nothing -- | Returns byte offset in input. getOffset :: Parser s Int getOffset = Parser $ \st -> Just (st, offset st) -- | Returns the line number. sourceLine :: Parser s Int sourceLine = Parser $ \st -> Just (st, line st) -- | Returns the source column number. (Tab stop is computed at 4.) sourceColumn :: Parser st Int sourceColumn = Parser $ \st -> Just (st, column st) -- | Try the first parser: if it succeeds, apply the second, -- returning its result, otherwise the third. branch :: Parser s b -> Parser s a -> Parser s a -> Parser s a branch pa pb pc = Parser $ \st -> case runParser pa st of Just (st',_) -> runParser pb st' Nothing -> runParser pc st -- | Parse an end of line sequence. endline :: Parser s () endline = branch (asciiChar '\r') (optional_ (asciiChar '\n')) (asciiChar '\n') -- | Return the rest of line (including the end of line). restOfLine :: Parser s ByteString restOfLine = byteStringOf $ skipMany (skipSatisfyByte (\c -> c /= '\n' && c /= '\r')) <* optional_ endline {-# INLINE isWs #-} -- | Is space, tab, `\r`, or `\n`. 
isWs :: Char -> Bool isWs c = c == ' ' || c == '\t' || c == '\r' || c == '\n' -- | Skip one space or tab. spaceOrTab :: Parser s () spaceOrTab = Parser $ \st -> case current st of Just ' ' -> Just (unsafeAdvanceByte st, ()) Just '\t' -> Just (unsafeAdvanceByte st, ()) _ -> Nothing -- | Skip 1 or more ASCII whitespace. ws :: Parser s () ws = skipSome (satisfyByte isWs) -- | Next character is ASCII whitespace. followedByWhitespace :: Parser s () followedByWhitespace = Parser $ \st -> case current st of Just c | isWs c -> Just (st, ()) _ -> Nothing -- | Followed by 0 or more spaces/tabs and endline or eof. followedByBlankLine :: Parser s () followedByBlankLine = Parser $ \st -> let subj = subject st !len = B8.length subj go !off | off >= len = Just (st, ()) | otherwise = case B8.index subj off of ' ' -> go (off + 1) '\r' -> go (off + 1) '\t' -> go (off + 1) '\n' -> Just (st, ()) _ -> Nothing in go (offset st) strToUtf8 :: String -> ByteString strToUtf8 = encodeUtf8 . T.pack utf8ToStr :: ByteString -> String utf8ToStr = T.unpack . decodeUtf8With lenientDecode djot-0.1.2.4/test/0000755000000000000000000000000007346545000011764 5ustar0000000000000000djot-0.1.2.4/test/Main.hs0000644000000000000000000001502207346545000013204 0ustar0000000000000000{-# LANGUAGE TupleSections #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} import Test.Tasty import Test.Tasty.QuickCheck import Test.Tasty.HUnit import qualified Data.Text.Lazy as TL import Data.Text.Lazy.Encoding (decodeUtf8With, encodeUtf8) import Data.Text.Encoding.Error (lenientDecode) import qualified Data.ByteString as B import qualified Data.ByteString.Lazy.Char8 as BL import Data.ByteString.Builder ( toLazyByteString ) import Djot ( ParseOptions(..), RenderOptions(..), SourcePosOption(..), parseDoc, renderHtml, renderDjot ) import Djot.Parse ( parse, satisfy, strToUtf8, utf8ToStr, Chunk(..) 
) import Djot.AST import System.FilePath ((), takeExtension, takeFileName) import System.Directory (getDirectoryContents) import Text.DocLayout (render) main :: IO () main = do specTests <- filter ((== ".test") . takeExtension) <$> getDirectoryContents "test" tests <- mapM (\fp -> (fp,) <$> getSpecTests ("test" fp)) specTests let parser = parseDoc ParseOptions{ sourcePositions = NoSourcePos } . BL.toStrict defaultMain $ testGroup "Tests" $ [ testGroup "djot -> html" (map (\(fp, ts) -> testGroup fp (map (toSpecTest parser) ts)) tests) , testGroup "native -> djot -> native" [testGroup fp (map (toRoundTripTest parser) ts) | (fp, ts) <- tests , takeFileName fp /= "raw.test"] , testGroup "Djot.Parse" parserTests , testGroup "sourcepos" sourcePosTests , testGroup "Fuzz" [testProperty "parses all inputs" (\s -> case parseDoc ParseOptions{ sourcePositions = NoSourcePos } (strToUtf8 s) of Left _ -> False Right _ -> True) ] ] parserTests :: [TestTree] parserTests = [ testCase "satisfy multibyte" (parse (satisfy (=='ǎ') *> satisfy (=='老')) () (toChunks $ strToUtf8 "ǎ老bc") @?= Just '老') , testProperty "UTF8 conversion round-trips" (\s -> utf8ToStr (strToUtf8 s) == s) ] sourcePosTests :: [TestTree] sourcePosTests = let convert = either mempty (fromUtf8 . toLazyByteString . renderHtml RenderOptions{ preserveSoftBreaks = True }) . parseDoc ParseOptions{ sourcePositions = AllSourcePos } in [ testCase "period at end" $ convert "the `goo` option.\n" @?= "

the goo option.

\n" , testCase "attr after *" $ convert "*{.foo}\n" @?= "

*

\n" , testCase "no newline at end" $ convert "foo" @?= "

foo

\n" , testCase "list" $ convert "1. > hello\nthere\n\n2. ok" @?= "
    \n
  1. \n
    \n

    hello\nthere

    \n
    \n
  2. \n
  3. \n

    ok

    \n
  4. \n
\n" , testCase "code block" $ convert "``` ruby\nhi\n```\n" @?= "
hi\n
\n" , testCase "nested " $ convert "*_hi_*" @?= "

hi

\n" , testCase "hr " $ convert "----\n" @?= "
\n" ] toChunks :: B.ByteString -> [Chunk] toChunks bs = [Chunk{ chunkBytes = bs, chunkLine = 1, chunkColumn = 0 }] toSpecTest :: (BL.ByteString -> Either String Doc) -> SpecTest -> TestTree toSpecTest parser st = testCase name (actual @?= expected) where name = "lines " ++ show (start_line st) ++ "-" ++ show (end_line st) expected = fromUtf8 $ html st ropts = RenderOptions{ preserveSoftBreaks = True } actual = either mempty (fromUtf8 . toLazyByteString . renderHtml ropts) . parser $ djot st toRoundTripTest :: (BL.ByteString -> Either String Doc) -> SpecTest -> TestTree toRoundTripTest parser st = testCase name ((actual == expected) @? rtlog) where name = "lines " ++ show (start_line st) ++ "-" ++ show (end_line st) native = either (\_ -> mempty) id $ parser (djot st) expected = native ropts = RenderOptions{ preserveSoftBreaks = True } renderedDjot = encodeUtf8 . TL.fromStrict $ render (Just 62) $ renderDjot ropts native actual = either (\_ -> mempty) id $ parser renderedDjot lbsToStr = TL.unpack . fromUtf8 rtlog = lbsToStr (djot st) <> "↓\n" <> show native <> "\n" <> "↓\n" <> lbsToStr renderedDjot <> "↓\n" <> show actual <> "\n" data SpecTest = SpecTest { djot :: BL.ByteString , source :: FilePath , end_line :: Int , start_line :: Int , html :: BL.ByteString } deriving (Show) getSpecTests :: FilePath -> IO [SpecTest] getSpecTests fp = do speclines <- zip [1..] . 
BL.lines <$> BL.readFile fp pure $ parseSpecTests fp speclines --- state machine parser for spec test cases data ParseState = Scanning | ParsingDjot (SpecTest, BL.ByteString) | ParsingHtml (SpecTest, BL.ByteString) deriving (Show) parseSpecTests :: FilePath -> [(Int, BL.ByteString)] -> [SpecTest] parseSpecTests fp = go Scanning where go _ [] = [] go Scanning ((ln, bs) : xs) | BL.length bs > 0 && BL.all (== '`') bs = go (ParsingDjot (SpecTest { djot = mempty , source = fp , end_line = ln , start_line = ln , html = mempty }, bs)) xs | otherwise = go Scanning xs go (ParsingDjot (st,fence)) ((_,bs) : xs) | bs == "." = go (ParsingHtml (st, fence)) xs | otherwise = go (ParsingDjot (st{ djot = djot st <> bs <> "\n" }, fence)) xs go (ParsingHtml (st,fence)) ((ln,bs) : xs) | bs == fence = st{ end_line = ln } : go Scanning xs | otherwise = go (ParsingHtml (st{ html = html st <> bs <> "\n" }, fence)) xs fromUtf8 :: BL.ByteString -> TL.Text fromUtf8 = decodeUtf8With lenientDecode djot-0.1.2.4/test/attributes.test0000644000000000000000000000774007346545000015063 0ustar0000000000000000An inline attribute attaches to the preceding element, which might be complex (span, emphasis, link) or a simple word (defined as a sequence of non-ASCII-whitespace characters). ``` foo привет{.ru} .

foo привет

``` ``` (some text){.attr} .

(some text)

``` ``` [some text]{.attr} .

some text

``` Ensure that emphasis that starts before the attribute can still close, even if the attribute contains a potential closer. ``` a *b{#id key="*"}* .

a b

``` ``` a *b{#id key="*"}o .

a *bo

``` Don't mind braces in quotes: ``` hi{key="{#hi"} .

hi

``` Process escapes correctly: ``` hi{key="\\\\\*"} {key="\\\\\*"} foo .

hi

foo

``` Don't allow attributes to start when we're parsing a potential attribute. ``` hi\{key="abc{#hi}" .

hi{key=“abc

``` ``` hi{key="\"#hi"} .

hi

``` ``` hi{key="hi\"#hi"} .

hi

``` Line break: ``` hi{#id .class key="value"} .

hi

``` Here there is nothing for the attribute to attach to: ``` {#id} at beginning .

at beginning

``` ``` After {#id} space {.class} .

After space

``` Block attributes come before the block, on a line by themselves. ``` {#id .class} A paragraph .

A paragraph

``` Use indentation if you need to continue the attributes over a line break. ``` {#id .class style="color:red"} A paragraph .

A paragraph

``` If the attribute block can't be parsed as attributes, it will be parsed as a regular paragraph: ``` {#id .cla*ss* .

{#id .class

``` You can use consecutive attribute blocks. In case of conflict, later values take precedence over earlier ones, but classes accumulate: ``` {#id} {key=val} {.foo .bar} {key=val2} {.baz} {#id2} Okay .

Okay

``` Attributes on different kinds of blocks: ``` {#id} > Block quote .

Block quote

``` ``` {#id} # Heading .

Heading

``` ``` {.blue} - - - - - .
``` ```` {highlight=3} ``` ruby x = 3 ``` .
x = 3
```` ``` {.special} 1. one 2. two .
  1. one
  2. two
``` ``` > {.foo} > > {.bar} > > nested .

nested

``` Comments start at a `%` character (not in quotes) and end with another `%` or the end of the attribute (`}`). These can be used to comment up an attribute list or without any real attributes. ``` foo{#ident % this is a comment % .class} .

foo

``` ``` foo{#ident % this is a comment} .

foo

``` In block-level comment, subsequent lines must be indented, as with attributes: ``` {% This is a comment before a block-level item. %} Paragraph. .

Paragraph.

``` Inline attributes can be empty: ``` hi{} .

hi

``` Block attributes can be empty: ``` {} hi .

hi

``` Non-attributes: ``` text{a=x hello .

text{a=x

hello

``` skip ``` skip {a=x skip hello skip . skip

{a=x skip hello

skip ``` ``` text{a=x # non-heading .

text{a=x # non-heading

``` skip ``` skip {a=x skip # non-heading skip . skip

{a=x skip # non-heading

skip ``` ``` {a=" inline text .

{a=“ inline text

``` ``` { attr="long value spanning multiple lines" } > a .

a

``` ``` > {key="bar > a\$bim"} > ou .

ou

``` djot-0.1.2.4/test/block_quote.test0000644000000000000000000000243307346545000015176 0ustar0000000000000000``` > Basic > block _quote_. .

Basic block quote.

``` ``` > Lazy block _quote_. .

Lazy block quote.

``` ``` > block > > quote .

block

quote

``` ``` > block > quote .

block

quote

``` ``` > > > nested .

nested

``` ``` > > > nested lazy .

nested lazy

``` ``` > > > nested > lazy .

nested lazy

``` ``` > nested > > > more .

nested

more

``` ``` >not blockquote .

>not blockquote

``` ``` >> not blockquote .

>> not blockquote

``` ``` > .
``` ``` > # Heading .

Heading

``` ``` > hi >there .

hi >there

``` ``` aaa > bbb .

aaa > bbb

``` ``` aaa > bbb .

aaa

bbb

``` djot-0.1.2.4/test/code_blocks.test0000644000000000000000000000112007346545000015126 0ustar0000000000000000``` ``` code block ``` .
code
  block
``` ```` ``` python x = y + 3 ``` .
x = y + 3
```` ```` ``` python if true: x = 3 ``` .
if true:
  x = 3
```` ```` ``` not a code block ``` .

not a code block

```` ```` ``` not a code block .

not a code block

```` ```` ``` hi ``` ``` two ``` .
hi
two
```` Empty code block: ```` ``` ``` .
```` djot-0.1.2.4/test/definition_lists.test0000644000000000000000000000152307346545000016234 0ustar0000000000000000Definition lists are just like ordinary bullet lists, but with `:` as the marker instead of `-`, `+`, or `*`. The first paragraph of the list item is interpreted as the term, and the rest as the definition. ``` : apple red fruit : banana yellow fruit .
apple
red fruit
banana
yellow fruit
``` Loose: ``` : apple red fruit : banana yellow fruit .
apple

red fruit

banana

yellow fruit

``` ``` : apple fruit Paragraph one Paragraph two - sub - list : orange .
apple fruit

Paragraph one

Paragraph two

  • sub
  • list
orange
``` ```` : ``` ok ``` .
ok
```` djot-0.1.2.4/test/emphasis.test0000644000000000000000000000427707346545000014510 0ustar0000000000000000``` *foo bar* .

foo bar

``` ``` a* foo bar* .

a* foo bar*

``` ``` *foo bar * .

*foo bar *

``` Unicode spaces don't block emphasis. ``` * a * .

 a 

``` Intraword: ``` foo*bar*baz .

foobarbaz

``` ``` _foo bar_ .

foo bar

``` ``` _ foo bar_ .

_ foo bar_

``` ``` _foo bar _ .

_foo bar _

``` Unicode spaces don't block emphasis. ``` _ a _ .

 a 

``` Intraword: ``` foo_bar_baz .

foobarbaz

``` ``` aa_"bb"_cc .

aa“bb”cc

``` ``` *foo_ .

*foo_

``` ``` _foo* .

_foo*

``` A line ending counts as whitespace: ``` _foo bar _ .

_foo bar _

``` So does a tab: ``` _ a_ .

_ a_

``` This one is different from commonmark: ``` _(_foo_)_ .

(foo)

``` But you can force the second `_` to be an opener using the marker `{`. ``` _({_foo_})_ .

(foo)

``` Note that an explicitly marked opener can only be closed by an explicitly marked closer, and a non-marked opener can only be closed by a non-marked closer: ``` {_ x_ _} _x_} .

x_ _x_}

``` ``` _(*foo*)_ .

(foo)

``` Overlapping scopes (first to close wins): ``` _foo *bar_ baz* .

foo *bar baz*

``` Over line break: ``` _foo bar_ .

foo bar

``` Inline content allowed: ``` *foo [link](url) `*`* .

foo link *

``` Can't emph an underscore: ``` ___ .

___

``` Unless you escape it: ``` _\__ .

_

``` No empty emph: ``` __ .

__

``` ``` _}b_ .

_}b_

``` ``` _\}b_ .

}b

``` ``` _ab\_c_ .

ab_c

``` ``` *****a***** .

a

``` ``` _[bar_](url) .

[bar](url)

``` ``` \_[bar_](url) .

_bar_

``` Code takes precedence: ``` _`a_`b .

_a_b

``` Autolinks take precedence: ``` _ .

_http://example.com/a_b

``` djot-0.1.2.4/test/escapes.test0000644000000000000000000000102007346545000014301 0ustar0000000000000000ASCII punctuation characters can be escaped: ``` \`\*\_\[\# .

`*_[#

``` Non-ASCII punctuation characters can't be escaped: ``` \a\« .

\a\«

``` An escaped newline is a hard break: ``` ab\ c .

ab
c

``` There can be spaces and tabs between the backslash and the newline: ``` ab\ c .

ab
c

``` There can also be spaces and tabs before the backslash, which are ignored: ``` ab \ c .

ab
c

``` An escaped space is a non-breaking space: ``` a\ b .

a b

``` djot-0.1.2.4/test/fenced_divs.test0000644000000000000000000000345507346545000015145 0ustar0000000000000000Fenced divs are containers for sequences of blocks, to which an attribute can be attached. A fenced div begins with an opening fence: a line with three or more consecutive `:` characters, followed optionally by a class name and optionally whitespace. It ends with a closing fence: a line beginning with three or more consecutive `:` characters, followed by optional whitespace and the end of the line. The number of `:` characters in the closing fence must be at least the number in the opening fence. If the end of the input (or enclosing block) is encountered before a closing fence, the fenced div is implicitly closed. ``` :::::::::: foo Hi > A block quote. ::::::::::: .

Hi

A block quote.

``` ``` {#bar .foo} ::: Hi > A block quote. ::::::::::::: .

Hi

A block quote.

``` Fenced divs may be nested. ``` {#bar .foo} :::: Hi ::: baz > A block quote. ::: :::: .

Hi

A block quote.

``` A fenced div cannot interrupt a paragraph, without an intervening blank line. ``` Paragraph text :::: Hi :::: .

Paragraph text :::: Hi ::::

``` A fenced div need not have attributes or a class name. ``` :::: Hi :::: .

Hi

``` The closing fence must be at least as long as the opening fence. ``` ::::::::: foo Hi :::: .

Hi ::::

``` If the end of the input (or enclosing block) is encountered before a closing fence, the fenced div is implicitly closed. ``` > :::: foo > Hi .

Hi

``` ```` ::: outer ``` ::: ``` ::: .
:::
```` djot-0.1.2.4/test/footnotes.test0000644000000000000000000000354107346545000014710 0ustar0000000000000000```` test[^a] and another[^foo_bar]. [^a]: This is a note. Second paragraph. [^foo_bar]: ``` code ``` another ref to the first note[^a]. .

test1 and another2.

another ref to the first note1.


  1. This is a note.

    Second paragraph.↩︎

  2. code
    

    ↩︎

```` ``` test[^nonexistent] [^unused]: note more .

test1


  1. ↩︎

``` ``` [^a] [^b] [^b]: .

1 2


  1. ↩︎

  2. ↩︎

``` Issue #37: ``` text[^footnote]. [^footnote]: very long footnote[^another-footnote] [^another-footnote]: bla bla[^another-footnote] .

text1.


  1. very long footnote2↩︎

  2. bla bla2↩︎

``` djot-0.1.2.4/test/headings.test0000644000000000000000000000355307346545000014455 0ustar0000000000000000``` ## Heading .

Heading

``` ``` # Heading # another .

Heading

another

``` ``` # Heading # continued .

Heading continued

``` ``` ## heading para .

heading

para

``` ``` ## .

``` ``` ## Heading ### Next level .

Heading

Next level

``` ``` # Heading lazy .

Heading lazy

``` ``` # Heading lazy # more lazy text .

Heading lazy more lazy

text

``` ``` ##Notheading .

##Notheading

``` ``` ## Heading .

Heading

``` ``` ## heading ## .

heading ##

``` ``` # # heading .

# heading

``` Auto-identifiers: ``` {#Foo-bar} Paragraph # Foo bar ## Foo bar {#baz} # Foo bar .

Paragraph

Foo bar

Foo bar

Foo bar

``` Implicit header references: ``` See [Introduction][]. # Introduction .

See Introduction.

Introduction

``` ``` See [Introduction][]. {#foo} # Introduction .

See Introduction.

Introduction

``` ``` See [Introduction][]. # Introduction [Introduction]: #bar .

See Introduction.

Introduction

``` djot-0.1.2.4/test/insert_delete_mark.test0000644000000000000000000000071207346545000016525 0ustar0000000000000000``` This is {-deleted _text_-}. The braces are -required-. And they must be in the -}right order{-. .

This is deleted text. The braces are -required-. And they must be in the -}right order{-.

``` ``` {+ Inserted text +} .

Inserted text

``` Interaction with smart: ``` {--hello--} .

-hello-

``` ``` This is {=marked *text*=}. .

This is marked text.

``` djot-0.1.2.4/test/links_and_images.test0000644000000000000000000000555107346545000016162 0ustar0000000000000000``` [basic _link_][a_b_] [a_b_]: url .

basic link

``` ``` ![basic _image_][a_b_] [a_b_]: url .

basic image

``` ``` [link][] [link]: url .

link

``` ``` [link][] [link]: url .

link

``` The URL can be split over multiple lines: ``` [link][] [link]: url andurl .

link

``` ``` [link](url andurl) .

link

``` ``` [link][] [link]: [link2]: url .

link

``` ``` [link][] [link][link2] [link2]: url2 [link]: url .

link link

``` ``` [link][a and b] [a and b]: url .

link

``` If the reference isn't found, we get an empty link. ``` [link][a and b] .

link

``` Reference definitions can't have line breaks in the key: ``` [link][a and b] [a and b]: url .

link

[a and b]: url

``` No case normalization is done on reference definitions: ``` [Link][] [link]: /url .

Link

``` Attributes on reference definitions get transferred to the link: ``` {title=foo} [ref]: /url [ref][] .

ref

``` Attributes on the link override those on references: ``` {title=foo} [ref]: /url [ref][]{title=bar} .

ref

``` ``` [link _and_ link][] [link _and_ link]: url .

link and link

``` ``` ![basic _image_](url) .

basic image

``` ``` [![image](img.jpg)](url) .

image

``` ``` [unclosed](hello *a b* .

[unclosed](hello a b

``` Note that soft breaks are ignored, so long URLs can be split over multiple lines: ``` [closed](hello *a b*) .

closed

``` Here the strong takes precedence over the link because it starts first: ``` *[closed](hello*) .

[closed](hello)

``` Avoid this with a backslash escape: ``` *[closed](hello\*) .

*closed

``` Link in link? ``` [[foo](bar)](baz) .

foo

``` Link in image? ``` ![[link](url)](img) .

link

``` Image in link? ``` [![image](img)](url) .

image

``` Autolinks: ``` .

http://example.com/foo me@example.com

``` Openers inside `[..](` or `[..][` or `[..]{` can't match outside them, even if the construction doesn't turn out to be a link or span or image. ``` [x_y](x_y) .

x_y

``` ``` [x_y](x_ .

[x_y](x_

``` ``` [x_y]{.bar_} .

x_y

``` djot-0.1.2.4/test/lists.test0000644000000000000000000000516607346545000014033 0ustar0000000000000000``` - one - two .
  • one
  • two
``` ``` - one - two - three .
  • one - two - three
``` ``` - one - two - three .
  • one
    • two
      • three
``` ``` - one and another paragraph - a list - two .
  • one and

    another paragraph

    • a list
  • two

``` ``` - one lazy - two .
  • one lazy
  • two
``` ``` - a - b + c .
  • a
  • b
  • c
``` ``` - a - b .
  • a

  • b

``` ``` - a - b - c - d .
  • a - b
    • c
  • d
``` ``` - a - b - c - d .
  • a - b
    • c
  • d
``` ``` - a b - c .
  • a

    b

  • c

``` ``` - a - b - c - d .
  • a
    • b
    • c
  • d
``` ``` - a - b - c - d .
  • a
    • b
    • c
  • d
``` ``` - a * b cd .
  • a
    • b cd
``` ``` - - - a .
      • a
``` ``` 1. one 1. two .
  1. one
  2. two
``` ``` 1. one 1. two .
  1. one
    1. two
``` ``` 4. one 5. two .
  1. one
  2. two
``` ``` 1) one 2) two .
  1. one
  2. two
``` ``` (1) one (2) two .
  1. one
  2. two
``` ``` (a) one (b) two .
  1. one
  2. two
``` ``` (D) one (E) two .
  1. one
  2. two
``` ``` a. one b. two .
  1. one
  2. two
``` ``` i. one ii. two .
  1. one
  2. two
``` ``` xli) one xlii) two .
  1. one
  2. two
``` ``` (IV) one (V) two .
  1. one
  2. two
``` ``` i. a ii. b .
  1. a
  2. b
``` ``` i. a j. b .
  1. a
  2. b
``` When ambiguous, prioritize roman numerals. ``` i. a i. b .
  1. a
  2. b
``` ``` I. a II. b E. d .
  1. a
  2. b
  1. d
``` ``` The civil war ended in 1865. And this should not start a list. .

The civil war ended in 1865. And this should not start a list.

``` djot-0.1.2.4/test/math.test0000644000000000000000000000130307346545000013613 0ustar0000000000000000Math goes in verbatim spans prefixed with either `$` (for inline math) or `$$` (for display math). ``` $`e=mc^2` .

\(e=mc^2\)

``` ``` My equation: $`e=mc^2` .

My equation: \(e=mc^2\)

``` ``` $$`e=mc^2` .

\[e=mc^2\]

``` ``` My equation: $$`e=mc^2` .

My equation: \[e=mc^2\]

``` Newlines are allowed, just as in verbatim: ``` $`e= mc^2` .

\(e= mc^2\)

``` `$` characters are allowed inside: ``` $`e=\text{the number $\pi$}` .

\(e=\text{the number $\pi$}\)

``` djot-0.1.2.4/test/para.test0000644000000000000000000000005107346545000013604 0ustar0000000000000000``` hi there .

hi there

``` djot-0.1.2.4/test/raw.test0000644000000000000000000000060707346545000013461 0ustar0000000000000000Raw inline content: ``` ``{=html} .

``` Raw block-level content: ```` ``` =html ``` .
```` You can't mix regular attributes and raw syntax: ```` `foo`{=html #id} ``` .

<b>foo</b>{=html #id}

```` Attributes attached to raw content will just be ignored: ```` {.foo} ``` =html
``` .
```` djot-0.1.2.4/test/regression.test0000644000000000000000000000326107346545000015047 0ustar0000000000000000Issue #104: ``` {1--} {1-} .

{1--}

{1-}

``` Issue #106: ``` |`| .

||

``` ``` |`|x .

||x

``` Issue #127: ``` \$$`a` .

$\(a\)

``` ``` { .` .

{ .

``` Issue #57: ``` | 1 | 2 | ^ cap1 ^ cap2 .
cap2
1 2
``` Section start after list: ``` : term def # New heading .
term
def

New heading

``` Block quotes with markers unaligned: ``` > foo > bar .

foo bar

``` ``` {#convertibilità} # Convertibilità .

Convertibilità

``` Issue #4 ``` 1. one 2. two ok .
  1. one
  2. two

ok

``` ``` 1. one 2. two |three|four| five .
  1. one
  2. two
three four

five

``` ``` |one|two|three| |four|five|six| .
one two three
four five six
``` Issue jgm/djoths#10: ``` 1. Hello ::: hi inside list? ::: ::: hi inside list? ::: .
  1. Hello

    inside list?

    inside list?

``` Issue #11: ``` > ```js > { > "A": 1 > } > ``` .
{
  "A": 1
}
``` Issue #12: ``` I like the Lemon Jelly album titled '}64–'}95. .

I like the Lemon Jelly album titled ’64–’95.

``` djot-0.1.2.4/test/smart.test0000644000000000000000000000633207346545000014017 0ustar0000000000000000Open quotes are matched with closed quotes. The same method is used for matching openers and closers as is used in emphasis parsing: ``` "Hello," said the spider. "'Shelob' is my name." .

“Hello,” said the spider. “‘Shelob’ is my name.”

``` ``` 'A', 'B', and 'C' are letters. .

‘A’, ‘B’, and ‘C’ are letters.

``` ``` 'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.' .

‘Oak,’ ‘elm,’ and ‘beech’ are names of trees. So is ‘pine.’

``` ``` 'He said, "I want to go."' .

‘He said, “I want to go.”’

``` A single quote that isn't an open quote matched with a close quote will be treated as an apostrophe: ``` Were you alive in the '70s? .

Were you alive in the ’70s?

``` ``` Here is some quoted '`code`' and a "[quoted link](url)". .

Here is some quoted ‘code’ and a “quoted link”.

``` Here the first `'` is treated as an apostrophe, not an open quote, because the final single quote is matched by the single quote before `jolly`: ``` 'tis the season to be 'jolly' .

’tis the season to be ‘jolly’

``` Multiple apostrophes should not be marked as open/closing quotes. ``` 'We'll use Jane's boat and John's truck,' Jenna said. .

‘We’ll use Jane’s boat and John’s truck,’ Jenna said.

``` An unmatched double quote will be interpreted as a left double quote, to facilitate this style: ``` "A paragraph with no closing quote. "Second paragraph by same speaker, in fiction." .

“A paragraph with no closing quote.

“Second paragraph by same speaker, in fiction.”

``` A quote following a `]` or `)` character cannot be an open quote: ``` [a]'s b' .

[a]’s b’

``` Quotes that are escaped come out as literal straight quotes: ``` \"This is not smart.\" This isn\'t either. 5\'8\" .

"This is not smart." This isn't either. 5'8"

``` Doubled quotes are treated as nested: ``` ''hi'' .

‘‘hi’’

``` Heuristics for determining openers and closers can be overridden using `{` and `}`: ``` {''}hi{''} .

‘’hi‘’

``` Two hyphens form an en-dash, three an em-dash. ``` Some dashes: em---em en--en em --- em en -- en 2--3 .

Some dashes: em—em en–en em — em en – en 2–3

``` A sequence of more than three hyphens is parsed as a sequence of em and/or en dashes, with no hyphens. If possible, a homogeneous sequence of dashes is used (so, 10 hyphens = 5 en dashes, and 9 hyphens = 3 em dashes). When a heterogeneous sequence must be used, the em dashes come first, followed by the en dashes, and as few en dashes as possible are used (so, 7 hyphens = 2 em dashes an 1 en dash). ``` one- two-- three--- four---- five----- six------ seven------- eight-------- nine--------- thirteen-------------. .

one- two– three— four–– five—– six—— seven—–– eight–––– nine——— thirteen———––.

``` Hyphens can be escaped: ``` Escaped hyphens: \-- \-\-\-. .

Escaped hyphens: -- ---.

``` Three periods form an ellipsis: ``` Ellipses...and...and.... .

Ellipses…and…and….

``` Periods can be escaped if ellipsis-formation is not wanted: ``` No ellipses\.\.\. .

No ellipses...

``` djot-0.1.2.4/test/spans.test0000644000000000000000000000043507346545000014013 0ustar0000000000000000``` This is a [test of *color*]{.blue}. .

This is a test of color.

``` ``` not a [span] {#id}. .

not a [span] .

``` ``` [nested [span]{.blue}]{#ident} .

nested span

``` djot-0.1.2.4/test/super_subscript.test0000644000000000000000000000035007346545000016117 0ustar0000000000000000``` H~2~O .

H2O

``` ``` mc^2^ .

mc2

``` ``` test^of superscript ~with subscript~^ .

testof superscript with subscript

``` ``` H{~2 ~}O .

H2 O

``` djot-0.1.2.4/test/symb.test0000644000000000000000000000025507346545000013641 0ustar0000000000000000``` :+1: :scream: .

:+1: :scream:

``` ``` :ice:scream: .

:ice:scream:

``` djot-0.1.2.4/test/tables.test0000644000000000000000000000277407346545000014151 0ustar0000000000000000Simplest table: ``` | a | .
a
``` ``` |a| *b*| |*c| d* | .
a b
*c d*
``` ``` | `a |` .

| a |

``` ``` | a | b | ^ With a _caption_ and another line. .
With a caption and another line.
a b
``` Table headers: note that we can have multiple headers; each determines the alignment for following cells, until the next header. ``` |a|b| |:-|---:| |c|d| |cc|dd| |-:|:-:| |e|f| |g|h| .
a b
c d
cc dd
e f
g h
``` ``` |--|--| .
``` ``` |---|---| | a | b | .
a b
``` ``` | | .
``` ``` | just two \| `|` | cells in this table | .
just two | | cells in this table
``` Indented table: ``` | a | b | |---|---| | 1 | 2 | .
a b
1 2
``` djot-0.1.2.4/test/task_lists.test0000644000000000000000000000116207346545000015045 0ustar0000000000000000``` - [ ] an unchecked task list item - [x] checked item .
``` ``` * [ ] an unchecked task list item with two paragraphs * [x] checked item .
  • with two paragraphs

``` djot-0.1.2.4/test/thematic_breaks.test0000644000000000000000000000043607346545000016015 0ustar0000000000000000``` hello - - - there .

hello


there

``` ``` hello ** ** there .

hello


there

``` ``` hello *-*-*-* there .

hello


there

``` ``` hello *-*-*-* there .

hello --- there

``` djot-0.1.2.4/test/verbatim.test0000644000000000000000000000075507346545000014505 0ustar0000000000000000``` Some `code` .

Some code

``` ``` Some `code with a line break` .

Some code with a line break

``` ``` Special characters: `*hi*` .

Special characters: *hi*

``` ``` *foo`*` .

*foo*

``` ``````` `````a`a``a```a````a``````a````` .

a`a``a```a````a``````a

``````` ``` ` ``a`` ` .

``a``

``` Implicitly closed by end of paragraph: ``` ` a c .

a c

```