Skip to content

Commit 799fd92

Browse files
authored
Merge pull request #633 from ntrrgc/toc-string-parsing
New TOC CD-TEXT string decoding
2 parents fb1440b + 4719c74 commit 799fd92

File tree

6 files changed

+472
-6
lines changed

6 files changed

+472
-6
lines changed

whipper/image/toc.py

Lines changed: 56 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,61 @@
3333
logger = logging.getLogger(__name__)
3434

3535
# shared
36-
_CDTEXT_CANDIDATE_RE = re.compile(r'(?P<key>\w+) "(?P<value>.+)"')
36+
# Matches a `KEY "value"` pair as written by cdrdao inside CD_TEXT blocks.
# The value is captured still escaped; decode it with parse_toc_string().
_CDTEXT_CANDIDATE_RE = re.compile(r'''
    (?P<key>\w+)        # CD-TEXT key.
    \s+
    "(?P<value>         # CD-TEXT value.
        (?:
            \\\\        # escaped backslash.
          | \\"         # escaped double-quote.
          | [^"]        # not a double-quote.
        )+?             # the value must not be empty.
    )"
''', flags=re.VERBOSE)

# Escape sequences recognised inside a quoted TOC string. Anything that
# does not match one of these alternatives is left untouched by re.sub(),
# which is what makes the decoding tolerant of malformed escapes.
# Octal digits are 0-7 only: admitting "8" here would let a sequence such
# as \888 match and then make int(..., base=8) raise ValueError.
_STRING_SUBSTITUTIONS_RE = re.compile(r'''
    \\(?P<octal>[0-7][0-7][0-7])    # three-digit octal escape.
    | \\"                           # escaped double-quote.
    | \\\\                          # escaped backslash.
''', flags=re.VERBOSE)


def _string_contents_repl(match: 're.Match[str]') -> str:
    """
    Return the replacement text for one _STRING_SUBSTITUTIONS_RE match.

    :param match: a match object produced by _STRING_SUBSTITUTIONS_RE.
    :returns: the character whose code point is the octal value for an
              octal escape, '"' for an escaped double-quote, or a single
              backslash for an escaped backslash.
    :raises RuntimeError: if the matched text is none of the above, which
                          can only happen if the regex and this function
                          get out of sync.
    """
    octal_digits = match.group('octal')
    if octal_digits is not None:
        return chr(int(octal_digits, base=8))

    escape = match.group(0)
    if escape == '\\"':
        return '"'
    if escape == '\\\\':
        return '\\'
    # Build a single message string; passing two positional arguments
    # would make the exception render as a tuple.
    raise RuntimeError('unexpected match: %r' % (escape,))


def parse_toc_string(str_within_quotes: str) -> str:
    """
    Given a quoted string obtained from a TOC file using
    _CDTEXT_CANDIDATE_RE, compute the unescaped string contained inside.

    Backslash substitutions fail gracefully, which is important since cdrdao
    string encoding has been found to be flawed as recently as cdrdao 1.2.5
    (2023):
    https://github.com/cdrdao/cdrdao/issues/32
    https://github.com/whipper-team/whipper/issues/169

    This function assumes cdrdao 1.2.5+ (2023) was used, which unless --no-utf8
    is passed, provides UTF-8 strings. It also works with older versions as long
    as the encoding was ASCII or Latin-1.

    Note: CD-Text in MS-JIS produced by cdrdao <1.2.5 will produce mojibake
    (garbled characters), just like the older code this function replaced.

    :param str_within_quotes: the still-escaped text found between the
                              double quotes (without the quotes).
    :returns: the text with three-digit octal escapes, escaped
              double-quotes and escaped backslashes substituted; every
              other character, including unrecognised backslash
              sequences, is returned unchanged.
    """
    return _STRING_SUBSTITUTIONS_RE.sub(_string_contents_repl,
                                        str_within_quotes)
90+
3791

3892
# header
3993
_CATALOG_RE = re.compile(r'^CATALOG "(?P<catalog>\d+)"$')
@@ -208,11 +262,7 @@ def parse(self):
208262
m = _CDTEXT_CANDIDATE_RE.search(line)
209263
if m:
210264
key = m.group('key')
211-
value = m.group('value')
212-
# usually, value is encoded with octal escapes and in latin-1
213-
# FIXME: other encodings are possible, does cdrdao handle
214-
# them ?
215-
value = value.encode().decode('unicode_escape')
265+
value = parse_toc_string(m.group('value'))
216266
if key in table.CDTEXT_FIELDS:
217267
# FIXME: consider ISRC separate for now, but this
218268
# is a limitation of our parser approach

whipper/test/diorama.cue

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
REM DISCID 700AC908
2+
REM COMMENT "whipper 0.10.1.dev27+ga4b9742.d20240827"
3+
PERFORMER "MØL"
4+
TITLE "Diorama"
5+
FILE "data.wav" WAVE
6+
TRACK 01 AUDIO
7+
PERFORMER "MØL"
8+
TITLE "Fraktur"
9+
ISRC DED832100085
10+
INDEX 01 00:00:00
11+
TRACK 02 AUDIO
12+
PERFORMER "MØL"
13+
TITLE "Photophobic"
14+
ISRC DED832100086
15+
INDEX 01 04:19:00
16+
TRACK 03 AUDIO
17+
PERFORMER "MØL"
18+
TITLE "Serf"
19+
ISRC DED832100087
20+
INDEX 01 09:37:00
21+
TRACK 04 AUDIO
22+
PERFORMER "MØL"
23+
TITLE "Vestige"
24+
ISRC DED832100088
25+
INDEX 01 14:59:12
26+
TRACK 05 AUDIO
27+
PERFORMER "MØL"
28+
TITLE "Redacted"
29+
ISRC DED832100089
30+
INDEX 01 20:37:68
31+
TRACK 06 AUDIO
32+
PERFORMER "MØL"
33+
TITLE "Itinerari"
34+
ISRC DED832100090
35+
INDEX 01 25:54:18
36+
TRACK 07 AUDIO
37+
PERFORMER "MØL"
38+
TITLE "Tvesind"
39+
ISRC DED832100091
40+
INDEX 01 30:57:58
41+
TRACK 08 AUDIO
42+
PERFORMER "MØL"
43+
TITLE "Diorama"
44+
ISRC DED832100092
45+
INDEX 01 38:46:57

whipper/test/diorama_noutf8.toc

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
CD_DA
2+
3+
CD_TEXT {
4+
LANGUAGE_MAP {
5+
0: 9
6+
}
7+
LANGUAGE 0 {
8+
TITLE "Diorama"
9+
PERFORMER "M\330L"
10+
SIZE_INFO { 0, 1, 8, 0, 7, 3, 0, 0, 0, 0, 0, 0,
11+
0, 0, 0, 0, 0, 0, 0, 3, 12, 0, 0, 0,
12+
0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0}
13+
}
14+
}
15+
16+
// Track 1
17+
TRACK AUDIO
18+
NO COPY
19+
NO PRE_EMPHASIS
20+
TWO_CHANNEL_AUDIO
21+
ISRC "DED832100085"
22+
CD_TEXT {
23+
LANGUAGE 0 {
24+
TITLE "Fraktur"
25+
PERFORMER "M\330L"
26+
}
27+
}
28+
FILE "data.wav" 0 04:19:00
29+
30+
31+
// Track 2
32+
TRACK AUDIO
33+
NO COPY
34+
NO PRE_EMPHASIS
35+
TWO_CHANNEL_AUDIO
36+
ISRC "DED832100086"
37+
CD_TEXT {
38+
LANGUAGE 0 {
39+
TITLE "Photophobic"
40+
PERFORMER "M\330L"
41+
}
42+
}
43+
FILE "data.wav" 04:19:00 05:18:00
44+
45+
46+
// Track 3
47+
TRACK AUDIO
48+
NO COPY
49+
NO PRE_EMPHASIS
50+
TWO_CHANNEL_AUDIO
51+
ISRC "DED832100087"
52+
CD_TEXT {
53+
LANGUAGE 0 {
54+
TITLE "Serf"
55+
PERFORMER "M\330L"
56+
}
57+
}
58+
FILE "data.wav" 09:37:00 05:22:12
59+
60+
61+
// Track 4
62+
TRACK AUDIO
63+
NO COPY
64+
NO PRE_EMPHASIS
65+
TWO_CHANNEL_AUDIO
66+
ISRC "DED832100088"
67+
CD_TEXT {
68+
LANGUAGE 0 {
69+
TITLE "Vestige"
70+
PERFORMER "M\330L"
71+
}
72+
}
73+
FILE "data.wav" 14:59:12 05:38:56
74+
75+
76+
// Track 5
77+
TRACK AUDIO
78+
NO COPY
79+
NO PRE_EMPHASIS
80+
TWO_CHANNEL_AUDIO
81+
ISRC "DED832100089"
82+
CD_TEXT {
83+
LANGUAGE 0 {
84+
TITLE "Redacted"
85+
PERFORMER "M\330L"
86+
}
87+
}
88+
FILE "data.wav" 20:37:68 05:16:25
89+
90+
91+
// Track 6
92+
TRACK AUDIO
93+
NO COPY
94+
NO PRE_EMPHASIS
95+
TWO_CHANNEL_AUDIO
96+
ISRC "DED832100090"
97+
CD_TEXT {
98+
LANGUAGE 0 {
99+
TITLE "Itinerari"
100+
PERFORMER "M\330L"
101+
}
102+
}
103+
FILE "data.wav" 25:54:18 05:03:40
104+
105+
106+
// Track 7
107+
TRACK AUDIO
108+
NO COPY
109+
NO PRE_EMPHASIS
110+
TWO_CHANNEL_AUDIO
111+
ISRC "DED832100091"
112+
CD_TEXT {
113+
LANGUAGE 0 {
114+
TITLE "Tvesind"
115+
PERFORMER "M\330L"
116+
}
117+
}
118+
FILE "data.wav" 30:57:58 07:48:74
119+
120+
121+
// Track 8
122+
TRACK AUDIO
123+
NO COPY
124+
NO PRE_EMPHASIS
125+
TWO_CHANNEL_AUDIO
126+
ISRC "DED832100092"
127+
CD_TEXT {
128+
LANGUAGE 0 {
129+
TITLE "Diorama"
130+
PERFORMER "M\330L"
131+
}
132+
}
133+
FILE "data.wav" 38:46:57 07:14:36
134+

whipper/test/diorama_utf8.toc

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
CD_DA
2+
3+
CD_TEXT {
4+
LANGUAGE_MAP {
5+
0: 9
6+
}
7+
LANGUAGE 0 {
8+
TITLE "Diorama"
9+
PERFORMER "MØL"
10+
SIZE_INFO { 0, 1, 8, 0, 7, 3, 0, 0, 0, 0, 0, 0,
11+
0, 0, 0, 0, 0, 0, 0, 3, 12, 0, 0, 0,
12+
0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0}
13+
}
14+
}
15+
16+
// Track 1
17+
TRACK AUDIO
18+
NO COPY
19+
NO PRE_EMPHASIS
20+
TWO_CHANNEL_AUDIO
21+
ISRC "DED832100085"
22+
CD_TEXT {
23+
LANGUAGE 0 {
24+
TITLE "Fraktur"
25+
PERFORMER "MØL"
26+
}
27+
}
28+
FILE "data.wav" 0 04:19:00
29+
30+
31+
// Track 2
32+
TRACK AUDIO
33+
NO COPY
34+
NO PRE_EMPHASIS
35+
TWO_CHANNEL_AUDIO
36+
ISRC "DED832100086"
37+
CD_TEXT {
38+
LANGUAGE 0 {
39+
TITLE "Photophobic"
40+
PERFORMER "MØL"
41+
}
42+
}
43+
FILE "data.wav" 04:19:00 05:18:00
44+
45+
46+
// Track 3
47+
TRACK AUDIO
48+
NO COPY
49+
NO PRE_EMPHASIS
50+
TWO_CHANNEL_AUDIO
51+
ISRC "DED832100087"
52+
CD_TEXT {
53+
LANGUAGE 0 {
54+
TITLE "Serf"
55+
PERFORMER "MØL"
56+
}
57+
}
58+
FILE "data.wav" 09:37:00 05:22:12
59+
60+
61+
// Track 4
62+
TRACK AUDIO
63+
NO COPY
64+
NO PRE_EMPHASIS
65+
TWO_CHANNEL_AUDIO
66+
ISRC "DED832100088"
67+
CD_TEXT {
68+
LANGUAGE 0 {
69+
TITLE "Vestige"
70+
PERFORMER "MØL"
71+
}
72+
}
73+
FILE "data.wav" 14:59:12 05:38:56
74+
75+
76+
// Track 5
77+
TRACK AUDIO
78+
NO COPY
79+
NO PRE_EMPHASIS
80+
TWO_CHANNEL_AUDIO
81+
ISRC "DED832100089"
82+
CD_TEXT {
83+
LANGUAGE 0 {
84+
TITLE "Redacted"
85+
PERFORMER "MØL"
86+
}
87+
}
88+
FILE "data.wav" 20:37:68 05:16:25
89+
90+
91+
// Track 6
92+
TRACK AUDIO
93+
NO COPY
94+
NO PRE_EMPHASIS
95+
TWO_CHANNEL_AUDIO
96+
ISRC "DED832100090"
97+
CD_TEXT {
98+
LANGUAGE 0 {
99+
TITLE "Itinerari"
100+
PERFORMER "MØL"
101+
}
102+
}
103+
FILE "data.wav" 25:54:18 05:03:40
104+
105+
106+
// Track 7
107+
TRACK AUDIO
108+
NO COPY
109+
NO PRE_EMPHASIS
110+
TWO_CHANNEL_AUDIO
111+
ISRC "DED832100091"
112+
CD_TEXT {
113+
LANGUAGE 0 {
114+
TITLE "Tvesind"
115+
PERFORMER "MØL"
116+
}
117+
}
118+
FILE "data.wav" 30:57:58 07:48:74
119+
120+
121+
// Track 8
122+
TRACK AUDIO
123+
NO COPY
124+
NO PRE_EMPHASIS
125+
TWO_CHANNEL_AUDIO
126+
ISRC "DED832100092"
127+
CD_TEXT {
128+
LANGUAGE 0 {
129+
TITLE "Diorama"
130+
PERFORMER "MØL"
131+
}
132+
}
133+
FILE "data.wav" 38:46:57 07:14:36
134+

0 commit comments

Comments
 (0)