Skip to content

Commit

Permalink
Merge pull request #52 from JuliaDocs/mg/15-latexstrings
Browse files Browse the repository at this point in the history
Improve parsing of tex strings
  • Loading branch information
goerz authored Oct 25, 2023
2 parents 4365db2 + 8f83b48 commit 975a5a2
Show file tree
Hide file tree
Showing 14 changed files with 901 additions and 106 deletions.
2 changes: 1 addition & 1 deletion .typos.toml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[files]
extend-exclude = ["*.bib", "test_formatting.jl"]
extend-exclude = ["*.bib", "test_formatting.jl", "test_tex_to_markdown.jl"]
4 changes: 3 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
* Skip the expansion of citations and bibliographies when running in doctest mode [[#34][]]
* Support underscores in citation keys [[#14][]]
* The `Pages` in a `@bibliography` block are now relative to the folder containing the current file. The behavior is consistent with `Pages` in Documenter's `@index` and `@contents` blocks. [[#22][]]
* The parsing of LaTeX strings has improved significantly. In particular, curly braces should now be stripped correctly [[#15][]]. Note that braces in titles are never needed for `DocumenterCitations`, but handling them correctly makes it easier to use the same `.bib` file for LaTeX and `DocumenterCitations`.

### Added

Expand All @@ -21,7 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
* Support for bibliographies in PDFs generated via LaTeX (`format=Documenter.LaTeX()`). Citations and references are rendered exactly as in the HTML version. Specifically, the support does not depend on `bibtex`/`biblatex` and supports any style (including custom styles). [[#18][]]
* Functions `DocumenterCitations.set_latex_options` and `DocumenterCitations.reset_latex_options` to tweak the rendering of bibliographies in PDFs.
* The `Pages` in a `@bibliography` block can now use `@__FILE__` to refer to the current file. [[#22][]]

* You may now use `\url` and `\href` commands in the `note` field of an entry.

### Internal Changes

Expand Down Expand Up @@ -128,5 +129,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
[#19]: https://github.com/JuliaDocs/DocumenterCitations.jl/issues/19
[#18]: https://github.com/JuliaDocs/DocumenterCitations.jl/issues/18
[#16]: https://github.com/JuliaDocs/DocumenterCitations.jl/issues/16
[#15]: https://github.com/JuliaDocs/DocumenterCitations.jl/issues/15
[#14]: https://github.com/JuliaDocs/DocumenterCitations.jl/issues/14
[#6]: https://github.com/JuliaDocs/DocumenterCitations.jl/issues/6
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ version = "1.2.1+dev"
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
Bibliography = "f1be7e48-bf82-45af-a471-ae754a193061"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
MarkdownAST = "d0879d2d-cac2-40c8-9cee-1863dc0c7391"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
Expand Down
47 changes: 47 additions & 0 deletions docs/latex/accents.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
\documentclass[aps,pra,onecolumn,noshowpacs,superscriptaddress,preprintnumbers,%
kamsmath,amssymb,notitlepage,letterpaper]{revtex4-2}

\def\Author{Michael Goerz}
\def\Title{Demo of Accented Characters}

\usepackage[utf8]{inputenc}
\usepackage{caption}
\captionsetup{justification=raggedright, singlelinecheck=true}
\usepackage[
pdftitle={\Title},
pdfauthor={\Author},
colorlinks=true, linkcolor=magenta, urlcolor=black, citecolor=magenta,
bookmarksopen=false, breaklinks=true, plainpages=false, pdfpagelabels
]{hyperref}

\begin{document}

\title{\Title}
\author{\Author}
\date{\today}

\maketitle

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{itemize}
\item a\oe b, a\oe b, a\oe{} b
\item \`a \'a
\item \"o, \"{o}, \"{oe}
\item M{\o}lmer
\item Fran\c{c}oise Fran{\c c}oise Fran\c{}coise
\item {\SS} and \ss.
\item {\i} and {\j}
\item {\"\i} and {\"\j}
\item escaped characters \% and zero-arg command \o behave differently.
\item x\d{o}x x\d{ooo}x x\d{}ox x\d ox x\d ooox
\item x\emph{word}x x\emph {word}x x\emph wordx x\emph{}wordx
\item x\href{http://www.google.com}{Google}x x\href {http://www.google.com} {Google}x
\item x\t{oo}x x\t{ou}x x\t{12}x x\t{abc}x
\item \i\j \"i \"j \"\i \"\j
\item \href{a}{\textit{a}\%x\textit{b}}
\end{itemize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\end{document}
5 changes: 4 additions & 1 deletion docs/src/refs.bib
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ @phdthesis{GoerzPhd2015
School = {Universität Kassel},
url = {https://kobra.uni-kassel.de/handle/123456789/2015052748381},
Year = {2015},
note = {See \url{https://michaelgoerz.net} for additional formats. Source available on \href{https://github.com/goerz/dissertation}{Github}},
}

@article{GoerzPRA2015,
Expand Down Expand Up @@ -495,6 +496,7 @@ @manual{SciPy
title = {{SciPy}: Open source scientific tools for {Python}},
year = {2001--},
url = {https://docs.scipy.org/doc/scipy/},
note = {Documentation available at \url{https://docs.scipy.org/doc/scipy/}},
}


Expand Down Expand Up @@ -672,6 +674,7 @@ @unpublished{TedRyd

@misc{jax,
author = {James Bradbury and Roy Frostig and Peter Hawkins and Matthew James Johnson and Chris Leary and Dougal Maclaurin and George Necula and Adam Paszke and Jake Vander{P}las and Skye Wanderman-{M}ilne and Qiao Zhang},
title = {JAX: composable transformations of Python+NumPy programs},
title = {\texttt{JAX}: composable transformations of Python+NumPy programs},
url = {https://github.com/google/jax},
note = {\url{https://numpy.org}},
}
12 changes: 7 additions & 5 deletions docs/src/syntax.md
Original file line number Diff line number Diff line change
Expand Up @@ -185,18 +185,20 @@ In general, the citation style determines the order of the references, see the [

The [`refs.bib`](./refs.bib) file is in the standard [BibTeX format](https://www.bibtex.com/g/bibtex-format/). It must be parsable by [BibParser.jl](https://github.com/Humans-of-Julia/BibParser.jl).

You will find that you get the best results by maintaining a `.bib` file by hand, specifically for a given project using `DocumenterCitations`. A `.bib` file that works well with LaTeX may or may not work well with `DocumenterCitations`: remember that in LaTeX, the strings inside any BibTeX fields are rendered through the TeX engine. At least in principle, they may contain arbitrary macros.
You will find that you get the best results by maintaining a `.bib` file by hand, specifically for a given project using `DocumenterCitations`. A `.bib` file that works well with LaTeX will usually, but not always, work well with `DocumenterCitations`: Remember that in LaTeX, the strings inside any BibTeX fields are rendered through the TeX engine. At least in principle, they may contain arbitrary macros.

In contrast, for `DocumenterCitations`, the BibTeX fields are minimally processed to convert some common LaTeX constructs to plain text, but beyond that, they are used "as-is". In future versions, the handling of LaTeX macros may improve, but it is best not to rely on it, and instead edit the `.bib` file so that it gives good results with `DocumenterCitations` (see the tips below).
In contrast, `DocumenterCitations` only parses a subset of LaTeX syntax and converts it to markdown. This includes the [commands for special symbols](https://www.bibtex.org/SpecialSymbols/) (although unicode is preferred) and a handful of commands like [`\url` and `\href`](https://www.overleaf.com/learn/latex/Hyperlinks#Linking_web_addresses). If you would like to have support added for a specific additional command, [please open an issue](https://github.com/JuliaDocs/DocumenterCitations.jl/issues/new/choose).
In addition, `DocumenterCitations` recognizes and preserves in-line math and removes braces.

While we try to be reasonably compatible, "Any `.bib` file will render the bibliography you expect" is not a design goal, but "It is possible to write a `.bib` file so that you get exactly the bibliography you want" is.
"Any `.bib` file will render the bibliography you expect" is not a design goal of `DocumenterCitations`: "It is possible to write a `.bib` file so that you get exactly the bibliography you want", and "Most reasonably clean `.bib` files that work with BibTeX should work out of the box" are.

Some tips to keep in mind when editing a `.bib` file to be used with `DocumenterCitations`:

* Use unicode instead of [escaped symbols](https://www.bibtex.org/SpecialSymbols/).
* You do not need to use [braces to protect capitalization](https://texfaq.org/FAQ-capbibtex). `DocumenterCitations` is not always able to remove such braces. But, unlike `bibtex`, `DocumenterCitations` will preserve the capitalization of titles.
* Use unicode instead of [escaped symbols](https://www.bibtex.org/SpecialSymbols/). Nowadays, `bibtex`/`pdflatex` seems to handle unicode without problems, so it's best to keep your `.bib` files in unicode.
* You do not need to use [braces to protect capitalization](https://texfaq.org/FAQ-capbibtex). Unlike `bibtex`, `DocumenterCitations` will preserve the capitalization of titles. You should always put the title in the `.bib` file as it appears in the published paper.
* Use a consistent scheme for citation keys. Shorter keys are better.
* All entries should have a `Doi` field, or a `Url` field if no DOI is available.
* You may have to work around some bugs in [BibParser.jl](https://github.com/Humans-of-Julia/BibParser.jl). For example, the parser [does not properly recognize organization names as authors](https://github.com/Humans-of-Julia/BibParser.jl/issues/30). A [workaround](https://github.com/JuliaDocs/DocumenterCitations.jl/issues/44#issuecomment-1762167119) is to use non-breaking spaces in the name.
* Use `@string` macros for abbreviated journal names, with the caveat of [#31](https://github.com/Humans-of-Julia/BibParser.jl/issues/31) and [#32](https://github.com/Humans-of-Julia/BibParser.jl/issues/32) in the [BibParser.jl issues](https://github.com/Humans-of-Julia/BibParser.jl/issues).


Expand Down
2 changes: 2 additions & 0 deletions src/DocumenterCitations.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ using Documenter.Writers.HTMLWriter
import MarkdownAST
import AbstractTrees

using Logging
using Markdown
using Bibliography: Bibliography, xyear, xlink, xtitle
using OrderedCollections: OrderedDict, OrderedSet
Expand Down Expand Up @@ -197,6 +198,7 @@ include("expand_citations.jl")
include("latex_options.jl")
include("bibliography_node.jl")
include("expand_bibliography.jl")
include("tex_to_markdown.jl")
include("formatting.jl")
include("labeled_styles_utils.jl")

Expand Down
87 changes: 34 additions & 53 deletions src/formatting.jl
Original file line number Diff line number Diff line change
@@ -1,53 +1,5 @@
# helper functions to render references in various styles

# Lookup table keyed by the letter of an argument-free TeX letter command
# (`\o`, `\O`, `\l`, `\L`, `\i`); the value is the Unicode string that the
# command is replaced with.
const tex2unicode_chars = Dict{Char,String}(
    'o' => "\u00F8",  # \o  ø  latin small letter O with stroke
    'O' => "\u00D8",  # \O  Ø  latin capital letter O with stroke
    'l' => "\u0142",  # \l  ł  latin small letter L with stroke
    'L' => "\u0141",  # \L  Ł  latin capital letter L with stroke
    'i' => "\u0131",  # \i  ı  latin small letter dotless I
)

# Ordered list of `pattern => replacement` rules that `tex2unicode` applies one
# after another via `replace`. The order matters: later rules operate on the
# output of earlier ones.
const tex2unicode_replacements = (
    # TeX dash ligatures. The em dash needs to go first; otherwise the `--`
    # rule would consume the first two hyphens of `---`.
    "---" => "\u2014", # --- — em dash
    "--" => "\u2013",  # --  – en dash

    # do this before tex2unicode_chars or it won't be recognized
    r"\\\\\"\{\\i\}" => s"\u0069\u308", # \"{\i} ï Latin Small Letter I with Diaeresis

    # replace quoted single letters before the remaining replacements, and do
    # them all at once, as these patterns rely on word boundaries which can
    # change due to the replacements we perform
    r"\\[oOlLi]\b" => c -> tex2unicode_chars[c[2]],
    r"\\`\{(\S{1})\}" => s"\1\u300", # \`{o} ò grave accent
    r"\\'\{(\S{1})\}" => s"\1\u301", # \'{o} ó acute accent
    r"\\\^\{(\S{1})\}" => s"\1\u302", # \^{o} ô circumflex
    r"\\~\{(\S{1})\}" => s"\1\u303", # \~{o} õ tilde
    r"\\=\{(\S{1})\}" => s"\1\u304", # \={o} ō macron accent (a bar over the letter)
    r"\\u\{(\S{1})\}" => s"\1\u306", # \u{o} ŏ breve over the letter
    r"\\\.\{(\S{1})\}" => s"\1\u307", # \.{o} ȯ dot over the letter
    r"\\\\\"\{(\S{1})\}" => s"\1\u308", # \"{o} ö umlaut, trema or dieresis
    r"\\r\{(\S{1})\}" => s"\1\u30A", # \r{a} å ring over the letter (for å there is also the special command \aa)
    r"\\H\{(\S{1})\}" => s"\1\u30B", # \H{o} ő long Hungarian umlaut (double acute)
    r"\\v\{(\S{1})\}" => s"\1\u30C", # \v{s} š caron/háček ("v") over the letter
    r"\\d\{(\S{1})\}" => s"\1\u323", # \d{u} ụ dot under the letter
    r"\\c\{(\S{1})\}" => s"\1\u327", # \c{c} ç cedilla
    r"\\k\{(\S{1})\}" => s"\1\u328", # \k{a} ą ogonek
    r"\\b\{(\S{1})\}" => s"\1\u331", # \b{b} ḇ bar under the letter
    r"\\t\{(\S{1})(\S{1})\}" => s"\1\u0361\2", # \t{oo} o͡o "tie" (inverted u) over the two letters
    r"\{\}" => s"", # empty curly braces should not have any effect
    r"\{([\w-]+)\}" => s"\1", # {<text>} <text> bracket stripping after applying all rules

    # Sources : https://www.compart.com/en/unicode/U+0131 enter the unicode character into the search box
)

"""
    tex2unicode(s)

Apply every rule in `tex2unicode_replacements` to `s`, in the order in which
the rules are listed, and return the result passed through `Unicode.normalize`.
"""
function tex2unicode(s)
    # Left fold: each rule is applied to the output of the previous one.
    converted = foldl(replace, tex2unicode_replacements; init=s)
    return Unicode.normalize(converted)
end

function linkify(text, link)
if isempty(text)
text = link
Expand Down Expand Up @@ -93,7 +45,8 @@ end
function alpha_label(entry)
year = isempty(entry.date.year) ? "??" : two_digit_year(entry.date.year)
if length(entry.authors) == 1
name = Unicode.normalize(entry.authors[1].last; stripmark=true)
name = tex_to_markdown(entry.authors[1].last)
name = Unicode.normalize(name; stripmark=true)
return uppercasefirst(first(name, 3)) * year
else
letters = [_alpha_initial(name) for name in first(entry.authors, 4)]
Expand Down Expand Up @@ -199,14 +152,16 @@ function format_names(
else
str = join(formatted_names, namesep)
end
str = tex_to_markdown(replace(str, r"[\n\r ]+" => " "))
if needs_et_al
str *= " $et_al_text"
end
return replace(str, r"[\n\r ]+" => " ")
return str
end


function format_published_in(entry; include_date=true, nbsp="\u00A0")
function format_published_in(entry; include_date=true, nbsp="\u00A0", link_doi=true)
# TODO: option to transform case of title
str = ""
if entry.type == "article"
str *= replace(entry.in.journal, " " => nbsp) # non-breaking space
Expand Down Expand Up @@ -288,12 +243,38 @@ function format_published_in(entry; include_date=true, nbsp="\u00A0")
if include_date && !isempty(entry.date.year)
str *= " ($(entry.date.year))"
end
return str
mdtext = tex_to_markdown(str)
if link_doi
link = _doi_link(entry)
return linkify(mdtext, link)
else
return mdtext
end
end


"""
    format_title(entry; italicize=true, link_url=true)

Return the title of `entry` as markdown text.

The title obtained from `xtitle(entry)` is converted with `tex_to_markdown`.
If `italicize` is true, a non-empty title is wrapped in `*…*`, unless it
already starts or ends with `*`. If `link_url` is true, the result is passed
through `linkify` together with `entry.access.url`.
"""
function format_title(entry; italicize=true, link_url=true)
    # TODO: option to transform case of title
    text = tex_to_markdown(xtitle(entry))
    has_star = startswith(text, "*") || endswith(text, "*")
    needs_wrap = !isempty(text) && italicize && !has_star
    styled = needs_wrap ? "*" * text * "*" : text
    return link_url ? linkify(styled, entry.access.url) : styled
end


function format_note(entry)
return strip(get(entry.fields, "note", "")) |> tex2unicode
return strip(get(entry.fields, "note", "")) |> tex_to_markdown
end


"""
    format_year(entry)

Return `entry.date.year` converted with `tex_to_markdown`.
"""
function format_year(entry)
    return tex_to_markdown(entry.date.year)
end


Expand Down
20 changes: 7 additions & 13 deletions src/labeled_styles_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -167,9 +167,8 @@ function _format_labeled_citet(
for (i, key) in enumerate(cit.keys)
try
entry = entries[key]
names = tex2unicode(
names =
format_names(entry; names=namesfmt, and=true, et_al, et_al_text="*et al.*")
)
if i == 1 && cit.capitalize
names = uppercasefirst(names)
end
Expand Down Expand Up @@ -226,22 +225,17 @@ mdstr = format_labeled_bibliography_reference(style, entry; namesfmt=:last)
* `namesfmt`: How to format the author names (`:full`, `:last`, `:lastonly`)
"""
function format_labeled_bibliography_reference(style, entry; namesfmt=:last)
authors = format_names(entry; names=namesfmt) |> tex2unicode
title = xtitle(entry)
if !isempty(title)
title = "*" * tex2unicode(title) * "*"
end
linked_title = linkify(title, entry.access.url)
published_in = linkify(tex2unicode(format_published_in(entry)), _doi_link(entry))

authors = format_names(entry; names=namesfmt)
title = format_title(entry)
published_in = format_published_in(entry)
eprint = format_eprint(entry)
note = format_note(entry)
parts = String[]
for part in (authors, linked_title, published_in, eprint, note)
for part in (authors, title, published_in, eprint, note)
if !isempty(part)
push!(parts, part)
end
end
html = _join_bib_parts(parts)
return html
mdtext = _join_bib_parts(parts)
return mdtext
end
27 changes: 9 additions & 18 deletions src/styles/authoryear.jl
Original file line number Diff line number Diff line change
Expand Up @@ -64,16 +64,14 @@ function format_authoryear_citation(
rethrow()
end
end
names = tex2unicode(
format_names(entry; names=namesfmt, and=true, et_al, et_al_text="*et al.*")
)
names = format_names(entry; names=namesfmt, and=true, et_al, et_al_text="*et al.*")
if isempty(names)
names = empty_names
end
if i == 1 && cit.capitalize
names = uppercasefirst(names)
end
year = tex2unicode(entry.date.year)
year = format_year(entry)
if isempty(year)
year = empty_year
end
Expand Down Expand Up @@ -126,33 +124,26 @@ function format_authoryear_bibliography_reference(
namesfmt=:lastfirst,
empty_names=""
)
authors = format_names(entry; names=namesfmt) |> tex2unicode
year = entry.date.year |> tex2unicode
authors = format_names(entry; names=namesfmt)
year = format_year(entry)
if !isempty(year)
if isempty(authors)
authors = empty_names
end
year = "($year)"
end
title = xtitle(entry)
if !isempty(title)
title = "*" * tex2unicode(title) * "*"
end
linked_title = linkify(title, entry.access.url)
published_in = linkify(
tex2unicode(format_published_in(entry; include_date=false)),
_doi_link(entry)
)
title = format_title(entry)
published_in = format_published_in(entry; include_date=false)
eprint = format_eprint(entry)
note = format_note(entry)
parts = String[]
for part in (authors, year, linked_title, published_in, eprint, note)
for part in (authors, year, title, published_in, eprint, note)
if !isempty(part)
push!(parts, part)
end
end
html = _join_bib_parts(parts)
return html
mdtext = _join_bib_parts(parts)
return mdtext
end


Expand Down
Loading

0 comments on commit 975a5a2

Please sign in to comment.