Skip to content

Commit

Permalink
Merge pull request #633 from ntrrgc/toc-string-parsing
Browse files Browse the repository at this point in the history
New TOC CD-TEXT string decoding
  • Loading branch information
MerlijnWajer authored Nov 25, 2024
2 parents fb1440b + 4719c74 commit 799fd92
Show file tree
Hide file tree
Showing 6 changed files with 472 additions and 6 deletions.
62 changes: 56 additions & 6 deletions whipper/image/toc.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,61 @@
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# shared
# NOTE(review): this single-line pattern is rebound by the verbose pattern
# directly below, so only the second definition is in effect afterwards.
_CDTEXT_CANDIDATE_RE = re.compile(r'(?P<key>\w+) "(?P<value>.+)"')
# Matches a CD-TEXT candidate: a word-character key, whitespace, and a
# non-empty double-quoted value. Inside the value an escaped backslash and an
# escaped double-quote are each consumed as a unit; any other character,
# including the backslash of an octal escape, is consumed by the
# "not a double-quote" alternative and is captured unchanged in 'value'.
_CDTEXT_CANDIDATE_RE = re.compile(r'''
(?P<key>\w+) # CD-TEXT key.
\s+
"(?P<value> # CD-TEXT value.
(?:
\\\\ # escaped backslash.
| \\" # escaped double-quote.
| [^"] # not a double-quote.
)+? # the value must not be empty.
)"
''', flags=re.VERBOSE)

# Escape sequences that parse_toc_string() rewrites. Anything else, including
# a backslash followed by characters that do not form one of these sequences,
# is not matched and therefore left untouched.
_STRING_SUBSTITUTIONS_RE = re.compile(r'''
    \\(?P<octal>[0-7][0-7][0-7])  # three-digit octal escape (digits 0-7).
    | \\"                         # escaped double-quote.
    | \\\\                        # escaped backslash.
''', flags=re.VERBOSE)


def _string_contents_repl(match: 're.Match[str]') -> str:
    """
    Return the replacement text for one _STRING_SUBSTITUTIONS_RE match.

    An octal escape becomes the character whose code point is the escaped
    value; an escaped double-quote becomes a double-quote; an escaped
    backslash becomes a single backslash.

    :param match: a match object produced by _STRING_SUBSTITUTIONS_RE.
    :returns: the unescaped character.
    :raises RuntimeError: if the match is none of the known escape sequences,
                          which means the pattern and this function are out
                          of sync.
    """
    group_octal = match.group('octal')
    if group_octal is not None:
        # The pattern only admits digits 0-7, so int() cannot fail here.
        return chr(int(group_octal, base=8))

    entire_match = match.group(0)
    if entire_match == '\\"':
        return '"'
    if entire_match == '\\\\':
        return '\\'
    # Build a single message string; passing the value as a second argument
    # would make the exception text render as a tuple.
    raise RuntimeError('unexpected match: %r' % (entire_match,))


def parse_toc_string(str_within_quotes: str) -> str:
    """
    Given a quoted string obtained from a TOC file using
    _CDTEXT_CANDIDATE_RE, compute the unescaped string contained inside.

    Backslash substitutions fail gracefully: sequences that are not a
    three-digit octal escape, an escaped double-quote or an escaped backslash
    are returned unchanged. This is important since cdrdao string encoding
    has been found to be flawed as recently as cdrdao 1.2.5 (2023):

    https://github.com/cdrdao/cdrdao/issues/32
    https://github.com/whipper-team/whipper/issues/169

    This function assumes cdrdao 1.2.5+ (2023) was used, which unless
    --no-utf8 is passed, provides UTF-8 strings. It also works with older
    versions as long as the encoding was ASCII or Latin-1.

    Note: CD-Text in MS-JIS produced by cdrdao <1.2.5 will produce mojibake
    (garbled characters), just like the older code this function replaced.

    :param str_within_quotes: the text between the double-quotes, still
                              containing its escape sequences.
    :returns: the text with the recognised escape sequences decoded.
    """
    return _STRING_SUBSTITUTIONS_RE.sub(_string_contents_repl,
                                        str_within_quotes)


# header
# Matches a string consisting solely of CATALOG "<digits>" and captures the
# digits in the named group 'catalog'.
_CATALOG_RE = re.compile(r'^CATALOG "(?P<catalog>\d+)"$')
Expand Down Expand Up @@ -208,11 +262,7 @@ def parse(self):
m = _CDTEXT_CANDIDATE_RE.search(line)
if m:
key = m.group('key')
value = m.group('value')
# usually, value is encoded with octal escapes and in latin-1
# FIXME: other encodings are possible, does cdrdao handle
# them ?
value = value.encode().decode('unicode_escape')
value = parse_toc_string(m.group('value'))
if key in table.CDTEXT_FIELDS:
# FIXME: consider ISRC separate for now, but this
# is a limitation of our parser approach
Expand Down
45 changes: 45 additions & 0 deletions whipper/test/diorama.cue
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
REM DISCID 700AC908
REM COMMENT "whipper 0.10.1.dev27+ga4b9742.d20240827"
PERFORMER "MØL"
TITLE "Diorama"
FILE "data.wav" WAVE
TRACK 01 AUDIO
PERFORMER "MØL"
TITLE "Fraktur"
ISRC DED832100085
INDEX 01 00:00:00
TRACK 02 AUDIO
PERFORMER "MØL"
TITLE "Photophobic"
ISRC DED832100086
INDEX 01 04:19:00
TRACK 03 AUDIO
PERFORMER "MØL"
TITLE "Serf"
ISRC DED832100087
INDEX 01 09:37:00
TRACK 04 AUDIO
PERFORMER "MØL"
TITLE "Vestige"
ISRC DED832100088
INDEX 01 14:59:12
TRACK 05 AUDIO
PERFORMER "MØL"
TITLE "Redacted"
ISRC DED832100089
INDEX 01 20:37:68
TRACK 06 AUDIO
PERFORMER "MØL"
TITLE "Itinerari"
ISRC DED832100090
INDEX 01 25:54:18
TRACK 07 AUDIO
PERFORMER "MØL"
TITLE "Tvesind"
ISRC DED832100091
INDEX 01 30:57:58
TRACK 08 AUDIO
PERFORMER "MØL"
TITLE "Diorama"
ISRC DED832100092
INDEX 01 38:46:57
134 changes: 134 additions & 0 deletions whipper/test/diorama_noutf8.toc
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
CD_DA

CD_TEXT {
LANGUAGE_MAP {
0: 9
}
LANGUAGE 0 {
TITLE "Diorama"
PERFORMER "M\330L"
SIZE_INFO { 0, 1, 8, 0, 7, 3, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 3, 12, 0, 0, 0,
0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0}
}
}

// Track 1
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100085"
CD_TEXT {
LANGUAGE 0 {
TITLE "Fraktur"
PERFORMER "M\330L"
}
}
FILE "data.wav" 0 04:19:00


// Track 2
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100086"
CD_TEXT {
LANGUAGE 0 {
TITLE "Photophobic"
PERFORMER "M\330L"
}
}
FILE "data.wav" 04:19:00 05:18:00


// Track 3
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100087"
CD_TEXT {
LANGUAGE 0 {
TITLE "Serf"
PERFORMER "M\330L"
}
}
FILE "data.wav" 09:37:00 05:22:12


// Track 4
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100088"
CD_TEXT {
LANGUAGE 0 {
TITLE "Vestige"
PERFORMER "M\330L"
}
}
FILE "data.wav" 14:59:12 05:38:56


// Track 5
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100089"
CD_TEXT {
LANGUAGE 0 {
TITLE "Redacted"
PERFORMER "M\330L"
}
}
FILE "data.wav" 20:37:68 05:16:25


// Track 6
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100090"
CD_TEXT {
LANGUAGE 0 {
TITLE "Itinerari"
PERFORMER "M\330L"
}
}
FILE "data.wav" 25:54:18 05:03:40


// Track 7
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100091"
CD_TEXT {
LANGUAGE 0 {
TITLE "Tvesind"
PERFORMER "M\330L"
}
}
FILE "data.wav" 30:57:58 07:48:74


// Track 8
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100092"
CD_TEXT {
LANGUAGE 0 {
TITLE "Diorama"
PERFORMER "M\330L"
}
}
FILE "data.wav" 38:46:57 07:14:36

134 changes: 134 additions & 0 deletions whipper/test/diorama_utf8.toc
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
CD_DA

CD_TEXT {
LANGUAGE_MAP {
0: 9
}
LANGUAGE 0 {
TITLE "Diorama"
PERFORMER "MØL"
SIZE_INFO { 0, 1, 8, 0, 7, 3, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 3, 12, 0, 0, 0,
0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0}
}
}

// Track 1
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100085"
CD_TEXT {
LANGUAGE 0 {
TITLE "Fraktur"
PERFORMER "MØL"
}
}
FILE "data.wav" 0 04:19:00


// Track 2
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100086"
CD_TEXT {
LANGUAGE 0 {
TITLE "Photophobic"
PERFORMER "MØL"
}
}
FILE "data.wav" 04:19:00 05:18:00


// Track 3
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100087"
CD_TEXT {
LANGUAGE 0 {
TITLE "Serf"
PERFORMER "MØL"
}
}
FILE "data.wav" 09:37:00 05:22:12


// Track 4
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100088"
CD_TEXT {
LANGUAGE 0 {
TITLE "Vestige"
PERFORMER "MØL"
}
}
FILE "data.wav" 14:59:12 05:38:56


// Track 5
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100089"
CD_TEXT {
LANGUAGE 0 {
TITLE "Redacted"
PERFORMER "MØL"
}
}
FILE "data.wav" 20:37:68 05:16:25


// Track 6
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100090"
CD_TEXT {
LANGUAGE 0 {
TITLE "Itinerari"
PERFORMER "MØL"
}
}
FILE "data.wav" 25:54:18 05:03:40


// Track 7
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100091"
CD_TEXT {
LANGUAGE 0 {
TITLE "Tvesind"
PERFORMER "MØL"
}
}
FILE "data.wav" 30:57:58 07:48:74


// Track 8
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100092"
CD_TEXT {
LANGUAGE 0 {
TITLE "Diorama"
PERFORMER "MØL"
}
}
FILE "data.wav" 38:46:57 07:14:36

Loading

0 comments on commit 799fd92

Please sign in to comment.