Skip to content

Commit

Permalink
New TOC CD-TEXT string decoding
Browse files Browse the repository at this point in the history
This patch replaces the previous broken approach to TOC string decoding
that used `.encode().decode('unicode_escape')` with proper parsing of
the escape sequences cdrdao is known to generate.

The new parser is also lenient with invalid escape sequences, which can
occur due to improper escaping in cdrdao. See:
cdrdao/cdrdao#32

Latin-1:

This new parsing method should work for Latin-1 strings for both old and
new versions of cdrdao, as long as those strings don't trigger the
improper escaping issues in upstream cdrdao.

This has been verified with the album Diorama from the Danish black
metal band MØL.

MS-JIS:

This new parsing method should also work for MS-JIS strings as long as
the .toc file was generated by cdrdao 1.2.5+ and the strings don't
trigger improper escaping issues in upstream cdrdao.

Unfortunately, I don't have any CD with CD-Text in MS-JIS, so I could
not verify this.

cdrdao versions before 1.2.5 will still cause whipper to produce
mojibake (garbled characters) when reading MS-JIS CD-Text, as those
versions do not encode strings in UTF-8.

Other encodings:

As far as I know, CD-Text officially supports only ASCII, Latin-1 and
MS-JIS, but I wouldn't be surprised if there are unofficial encodings
out there, given the strange strings I've seen in some bug reports.

If you have a CD with garbled CD-Text, please submit a bug report
indicating the performer, album name, language and attach the .toc file
so that the produced strings can be compared to the expected text.

Fixes whipper-team#169

Fixes whipper-team#183

Signed-off-by: Alicia Boya García <[email protected]>
  • Loading branch information
ntrrgc committed Aug 27, 2024
1 parent a4b9742 commit ac16fbc
Show file tree
Hide file tree
Showing 6 changed files with 472 additions and 6 deletions.
62 changes: 56 additions & 6 deletions whipper/image/toc.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,61 @@
logger = logging.getLogger(__name__)

# shared
_CDTEXT_CANDIDATE_RE = re.compile(r'(?P<key>\w+) "(?P<value>.+)"')
# Matches one CD-TEXT candidate: a word-like key, whitespace, then a
# non-empty double-quoted value.  The key is exposed as the named group
# ``key`` and the text between the quotes (still escaped) as ``value``.
#
# Inside the value, an escaped backslash or an escaped double-quote is
# consumed as a unit so that an escaped quote does not terminate the string.
# Because ``[^"]`` also accepts a lone backslash, malformed escape sequences
# are still matched instead of rejecting the whole line.
#
# The pattern is compiled with re.VERBOSE: whitespace and ``#`` comments
# inside the pattern are ignored by the regex engine.
_CDTEXT_CANDIDATE_RE = re.compile(r'''
(?P<key>\w+) # CD-TEXT key.
\s+
"(?P<value> # CD-TEXT value.
(?:
\\\\ # escaped backslash.
| \\" # escaped double-quote.
| [^"] # not a double-quote.
)+? # the value must not be empty.
)"
''', flags=re.VERBOSE)

# Escape sequences that are substituted when unescaping a TOC string.
# Anything else (including a backslash followed by something that is not a
# valid escape) does not match and is therefore left untouched.
#
# The octal escape must consist of exactly three *octal* digits (0-7):
# accepting 8 or 9 here would make int(..., base=8) raise ValueError.
_STRING_SUBSTITUTIONS_RE = re.compile(r'''
    \\(?P<octal>[0-7]{3})   # three-digit octal escape, e.g. \330.
    | \\"                   # escaped double-quote.
    | \\\\                  # escaped backslash.
''', flags=re.VERBOSE)


def _string_contents_repl(match: 're.Match[str]') -> str:
    """
    Return the replacement text for one _STRING_SUBSTITUTIONS_RE match.

    An octal escape becomes the character whose code point is the octal
    value; an escaped double-quote becomes a double-quote; an escaped
    backslash becomes a single backslash.

    Raises RuntimeError if the match is none of the above, which can only
    happen if the regular expression and this function get out of sync.
    """
    group_octal = match.group('octal')
    if group_octal is not None:
        return chr(int(group_octal, base=8))

    entire_match = match.group(0)
    if entire_match == '\\"':
        return '"'
    if entire_match == '\\\\':
        return '\\'
    raise RuntimeError(f"unexpected match: {entire_match!r}")


def parse_toc_string(str_within_quotes: str) -> str:
    """
    Given a quoted string obtained from a TOC file using
    _CDTEXT_CANDIDATE_RE, compute the unescaped string contained inside.

    The argument is the text between the double quotes, still escaped. The
    return value is that text with octal escapes, escaped double-quotes and
    escaped backslashes substituted.

    Backslash substitutions fail gracefully: a backslash that does not
    start a recognised escape sequence is kept as is. This is important
    since cdrdao string encoding has been found to be flawed as recently as
    cdrdao 1.2.5 (2023):
    https://github.com/cdrdao/cdrdao/issues/32
    https://github.com/whipper-team/whipper/issues/169

    This function assumes cdrdao 1.2.5+ (2023) was used, which provides
    UTF-8 strings unless --no-utf8 is passed. It also works with older
    versions as long as the encoding was ASCII or Latin-1.

    Note: CD-Text in MS-JIS produced by cdrdao <1.2.5 will produce mojibake
    (garbled characters), just like the older code this function replaced.
    """
    return _STRING_SUBSTITUTIONS_RE.sub(_string_contents_repl,
                                        str_within_quotes)


# header
_CATALOG_RE = re.compile(r'^CATALOG "(?P<catalog>\d+)"$')
Expand Down Expand Up @@ -208,11 +262,7 @@ def parse(self):
m = _CDTEXT_CANDIDATE_RE.search(line)
if m:
key = m.group('key')
value = m.group('value')
# usually, value is encoded with octal escapes and in latin-1
# FIXME: other encodings are possible, does cdrdao handle
# them ?
value = value.encode().decode('unicode_escape')
value = parse_toc_string(m.group('value'))
if key in table.CDTEXT_FIELDS:
# FIXME: consider ISRC separate for now, but this
# is a limitation of our parser approach
Expand Down
45 changes: 45 additions & 0 deletions whipper/test/diorama.cue
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
REM DISCID 700AC908
REM COMMENT "whipper 0.10.1.dev27+ga4b9742.d20240827"
PERFORMER "MØL"
TITLE "Diorama"
FILE "data.wav" WAVE
TRACK 01 AUDIO
PERFORMER "MØL"
TITLE "Fraktur"
ISRC DED832100085
INDEX 01 00:00:00
TRACK 02 AUDIO
PERFORMER "MØL"
TITLE "Photophobic"
ISRC DED832100086
INDEX 01 04:19:00
TRACK 03 AUDIO
PERFORMER "MØL"
TITLE "Serf"
ISRC DED832100087
INDEX 01 09:37:00
TRACK 04 AUDIO
PERFORMER "MØL"
TITLE "Vestige"
ISRC DED832100088
INDEX 01 14:59:12
TRACK 05 AUDIO
PERFORMER "MØL"
TITLE "Redacted"
ISRC DED832100089
INDEX 01 20:37:68
TRACK 06 AUDIO
PERFORMER "MØL"
TITLE "Itinerari"
ISRC DED832100090
INDEX 01 25:54:18
TRACK 07 AUDIO
PERFORMER "MØL"
TITLE "Tvesind"
ISRC DED832100091
INDEX 01 30:57:58
TRACK 08 AUDIO
PERFORMER "MØL"
TITLE "Diorama"
ISRC DED832100092
INDEX 01 38:46:57
134 changes: 134 additions & 0 deletions whipper/test/diorama_noutf8.toc
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
CD_DA

CD_TEXT {
LANGUAGE_MAP {
0: 9
}
LANGUAGE 0 {
TITLE "Diorama"
PERFORMER "M\330L"
SIZE_INFO { 0, 1, 8, 0, 7, 3, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 3, 12, 0, 0, 0,
0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0}
}
}

// Track 1
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100085"
CD_TEXT {
LANGUAGE 0 {
TITLE "Fraktur"
PERFORMER "M\330L"
}
}
FILE "data.wav" 0 04:19:00


// Track 2
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100086"
CD_TEXT {
LANGUAGE 0 {
TITLE "Photophobic"
PERFORMER "M\330L"
}
}
FILE "data.wav" 04:19:00 05:18:00


// Track 3
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100087"
CD_TEXT {
LANGUAGE 0 {
TITLE "Serf"
PERFORMER "M\330L"
}
}
FILE "data.wav" 09:37:00 05:22:12


// Track 4
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100088"
CD_TEXT {
LANGUAGE 0 {
TITLE "Vestige"
PERFORMER "M\330L"
}
}
FILE "data.wav" 14:59:12 05:38:56


// Track 5
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100089"
CD_TEXT {
LANGUAGE 0 {
TITLE "Redacted"
PERFORMER "M\330L"
}
}
FILE "data.wav" 20:37:68 05:16:25


// Track 6
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100090"
CD_TEXT {
LANGUAGE 0 {
TITLE "Itinerari"
PERFORMER "M\330L"
}
}
FILE "data.wav" 25:54:18 05:03:40


// Track 7
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100091"
CD_TEXT {
LANGUAGE 0 {
TITLE "Tvesind"
PERFORMER "M\330L"
}
}
FILE "data.wav" 30:57:58 07:48:74


// Track 8
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100092"
CD_TEXT {
LANGUAGE 0 {
TITLE "Diorama"
PERFORMER "M\330L"
}
}
FILE "data.wav" 38:46:57 07:14:36

134 changes: 134 additions & 0 deletions whipper/test/diorama_utf8.toc
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
CD_DA

CD_TEXT {
LANGUAGE_MAP {
0: 9
}
LANGUAGE 0 {
TITLE "Diorama"
PERFORMER "MØL"
SIZE_INFO { 0, 1, 8, 0, 7, 3, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 3, 12, 0, 0, 0,
0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0}
}
}

// Track 1
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100085"
CD_TEXT {
LANGUAGE 0 {
TITLE "Fraktur"
PERFORMER "MØL"
}
}
FILE "data.wav" 0 04:19:00


// Track 2
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100086"
CD_TEXT {
LANGUAGE 0 {
TITLE "Photophobic"
PERFORMER "MØL"
}
}
FILE "data.wav" 04:19:00 05:18:00


// Track 3
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100087"
CD_TEXT {
LANGUAGE 0 {
TITLE "Serf"
PERFORMER "MØL"
}
}
FILE "data.wav" 09:37:00 05:22:12


// Track 4
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100088"
CD_TEXT {
LANGUAGE 0 {
TITLE "Vestige"
PERFORMER "MØL"
}
}
FILE "data.wav" 14:59:12 05:38:56


// Track 5
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100089"
CD_TEXT {
LANGUAGE 0 {
TITLE "Redacted"
PERFORMER "MØL"
}
}
FILE "data.wav" 20:37:68 05:16:25


// Track 6
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100090"
CD_TEXT {
LANGUAGE 0 {
TITLE "Itinerari"
PERFORMER "MØL"
}
}
FILE "data.wav" 25:54:18 05:03:40


// Track 7
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100091"
CD_TEXT {
LANGUAGE 0 {
TITLE "Tvesind"
PERFORMER "MØL"
}
}
FILE "data.wav" 30:57:58 07:48:74


// Track 8
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100092"
CD_TEXT {
LANGUAGE 0 {
TITLE "Diorama"
PERFORMER "MØL"
}
}
FILE "data.wav" 38:46:57 07:14:36

Loading

0 comments on commit ac16fbc

Please sign in to comment.