Skip to content

Commit ce1d8b5

Browse files
committed
Handle names that have more than 2 parts
Handle surnames with prefixes like von, van etc.
1 parent e40eb19 commit ce1d8b5

File tree

2 files changed: +149 -82 lines changed

VndbCharacterNames/Program.cs

Lines changed: 141 additions & 80 deletions
Original file line number | Diff line number | Diff line change
@@ -114,28 +114,30 @@ public static void Main(string[] args)
114114
}
115115

116116
_ = convertedRecords.Add(new ConvertedNameRecord(fullNameWithoutAnyWhiteSpace, vndbNameRecord.FullNameInRomaji, vndbNameRecord.Sex, definition));
117-
(NameRecord surnameRecord, NameRecord givenNameRecord)? surnameAndNameRecords = GetSurnameAndNameRecords(vndbNameRecord.FullName, vndbNameRecord.FullNameInRomaji);
117+
List<(NameRecord surnameRecord, NameRecord givenNameRecord)>? surnameAndNameRecords = GetSurnameAndNameRecords(vndbNameRecord.FullName, vndbNameRecord.FullNameInRomaji);
118118
if (surnameAndNameRecords is not null)
119119
{
120-
NameRecord surnameRecord = surnameAndNameRecords.Value.surnameRecord;
121-
NameRecord givenNameRecord = surnameAndNameRecords.Value.surnameRecord;
120+
foreach ((NameRecord surnameRecord, NameRecord givenNameRecord) in surnameAndNameRecords)
121+
{
122+
nameTypesDict.AddIfNotExists(surnameRecord, Utils.SurnameNameType);
122123

123-
nameTypesDict.AddIfNotExists(surnameRecord, Utils.SurnameNameType);
124+
if (vndbNameRecord.Sex is not null)
125+
{
126+
nameTypesDict.AddIfNotExists(givenNameRecord, vndbNameRecord.Sex);
127+
}
124128

125-
if (vndbNameRecord.Sex is not null)
126-
{
127-
nameTypesDict.AddIfNotExists(givenNameRecord, vndbNameRecord.Sex);
129+
_ = convertedRecords.Add(new ConvertedNameRecord(surnameRecord.Name, surnameRecord.NameInRomaji));
130+
_ = convertedRecords.Add(new ConvertedNameRecord(givenNameRecord.Name, givenNameRecord.NameInRomaji));
128131
}
129132

130-
_ = convertedRecords.Add(new ConvertedNameRecord(surnameRecord.Name, surnameRecord.NameInRomaji));
131-
_ = convertedRecords.Add(new ConvertedNameRecord(givenNameRecord.Name, givenNameRecord.NameInRomaji));
132-
133133
List<NameRecord>? aliasRecords = vndbNameRecord.GetAliasPairs();
134134
if (aliasRecords is not null)
135135
{
136+
string[] surnames = surnameAndNameRecords.Select(static r => r.surnameRecord.Name).ToArray();
137+
string[] givenNames = surnameAndNameRecords.Select(static r => r.givenNameRecord.Name).ToArray();
136138
foreach (NameRecord aliasRecord in aliasRecords)
137139
{
138-
if (surnameRecord.Name != aliasRecord.Name && givenNameRecord.Name != aliasRecord.Name)
140+
if (!surnames.Contains(aliasRecord.Name) && !givenNames.Contains(aliasRecord.Name))
139141
{
140142
string fullAliasWithoutAnyWhiteSpace = string.Join("", aliasRecord.Name.Split());
141143
if (vndbNameRecord.Sex is not null)
@@ -144,26 +146,28 @@ public static void Main(string[] args)
144146
}
145147

146148
_ = convertedRecords.Add(new ConvertedNameRecord(fullAliasWithoutAnyWhiteSpace, aliasRecord.NameInRomaji, vndbNameRecord.Sex, definition));
147-
(NameRecord surnameRecord, NameRecord givenNameRecord)? aliasSurnameAndNameRecords = GetSurnameAndNameRecords(aliasRecord.Name, aliasRecord.NameInRomaji);
149+
List<(NameRecord surnameRecord, NameRecord givenNameRecord)>? aliasSurnameAndNameRecords = GetSurnameAndNameRecords(aliasRecord.Name, aliasRecord.NameInRomaji);
148150
if (aliasSurnameAndNameRecords is not null)
149151
{
150-
NameRecord aliasSurnameRecord = aliasSurnameAndNameRecords.Value.surnameRecord;
151-
NameRecord aliasGivenNameRecord = aliasSurnameAndNameRecords.Value.surnameRecord;
152-
153-
if (aliasSurnameRecord.Name != surnameRecord.Name)
152+
string[] aliasSurnames = aliasSurnameAndNameRecords.Select(static r => r.surnameRecord.Name).ToArray();
153+
string[] aliasGivenNames = aliasSurnameAndNameRecords.Select(static r => r.givenNameRecord.Name).ToArray();
154+
foreach ((NameRecord aliasSurnameRecord, NameRecord aliasGivenNameRecord) in aliasSurnameAndNameRecords)
154155
{
155-
nameTypesDict.AddIfNotExists(aliasSurnameRecord, Utils.SurnameNameType);
156-
_ = convertedRecords.Add(new ConvertedNameRecord(aliasSurnameRecord.Name, aliasSurnameRecord.NameInRomaji));
157-
}
158-
159-
if (aliasGivenNameRecord.Name != givenNameRecord.Name)
160-
{
161-
if (vndbNameRecord.Sex is not null)
156+
if (!aliasSurnames.Contains(aliasSurnameRecord.Name))
162157
{
163-
nameTypesDict.AddIfNotExists(aliasGivenNameRecord, vndbNameRecord.Sex);
158+
nameTypesDict.AddIfNotExists(aliasSurnameRecord, Utils.SurnameNameType);
159+
_ = convertedRecords.Add(new ConvertedNameRecord(aliasSurnameRecord.Name, aliasSurnameRecord.NameInRomaji));
164160
}
165161

166-
_ = convertedRecords.Add(new ConvertedNameRecord(aliasGivenNameRecord.Name, aliasGivenNameRecord.NameInRomaji));
162+
if (!aliasGivenNames.Contains(aliasGivenNameRecord.Name))
163+
{
164+
if (vndbNameRecord.Sex is not null)
165+
{
166+
nameTypesDict.AddIfNotExists(aliasGivenNameRecord, vndbNameRecord.Sex);
167+
}
168+
169+
_ = convertedRecords.Add(new ConvertedNameRecord(aliasGivenNameRecord.Name, aliasGivenNameRecord.NameInRomaji));
170+
}
167171
}
168172
}
169173
}
@@ -184,20 +188,29 @@ public static void Main(string[] args)
184188
}
185189

186190
_ = convertedRecords.Add(new ConvertedNameRecord(fullAliasWithoutAnyWhiteSpace, aliasRecord.NameInRomaji, vndbNameRecord.Sex, definition));
187-
(NameRecord surnameRecord, NameRecord givenNameRecord)? aliasSurnameAndNameRecords = GetSurnameAndNameRecords(aliasRecord.Name, aliasRecord.NameInRomaji);
191+
List<(NameRecord surnameRecord, NameRecord givenNameRecord)>? aliasSurnameAndNameRecords = GetSurnameAndNameRecords(aliasRecord.Name, aliasRecord.NameInRomaji);
188192
if (aliasSurnameAndNameRecords is not null)
189193
{
190-
NameRecord aliasSurnameRecord = aliasSurnameAndNameRecords.Value.surnameRecord;
191-
NameRecord aliasGivenNameRecord = aliasSurnameAndNameRecords.Value.surnameRecord;
194+
string[] aliasSurnames = aliasSurnameAndNameRecords.Select(static r => r.surnameRecord.Name).ToArray();
195+
string[] aliasGivenNames = aliasSurnameAndNameRecords.Select(static r => r.givenNameRecord.Name).ToArray();
196+
foreach ((NameRecord aliasSurnameRecord, NameRecord aliasGivenNameRecord) in aliasSurnameAndNameRecords)
197+
{
198+
if (!aliasSurnames.Contains(aliasSurnameRecord.Name))
199+
{
200+
nameTypesDict.AddIfNotExists(aliasSurnameRecord, Utils.SurnameNameType);
201+
_ = convertedRecords.Add(new ConvertedNameRecord(aliasSurnameRecord.Name, aliasSurnameRecord.NameInRomaji));
202+
}
192203

193-
nameTypesDict.AddIfNotExists(aliasSurnameRecord, Utils.SurnameNameType);
194-
_ = convertedRecords.Add(new ConvertedNameRecord(aliasSurnameRecord.Name, aliasSurnameRecord.NameInRomaji));
204+
if (!aliasGivenNames.Contains(aliasGivenNameRecord.Name))
205+
{
206+
if (vndbNameRecord.Sex is not null)
207+
{
208+
nameTypesDict.AddIfNotExists(aliasGivenNameRecord, vndbNameRecord.Sex);
209+
}
195210

196-
if (vndbNameRecord.Sex is not null)
197-
{
198-
nameTypesDict.AddIfNotExists(aliasGivenNameRecord, vndbNameRecord.Sex);
211+
_ = convertedRecords.Add(new ConvertedNameRecord(aliasGivenNameRecord.Name, aliasGivenNameRecord.NameInRomaji));
212+
}
199213
}
200-
_ = convertedRecords.Add(new ConvertedNameRecord(aliasGivenNameRecord.Name, aliasGivenNameRecord.NameInRomaji));
201214
}
202215
}
203216
}
@@ -259,64 +272,112 @@ public static void Main(string[] args)
259272
}
260273
}
261274

262-
private static (NameRecord surnameRecord, NameRecord givenNameRecord)? GetSurnameAndNameRecords(string fullName, string fullNameInRomaji)
275+
private static List<(NameRecord surnameRecord, NameRecord givenNameRecord)>? GetSurnameAndNameRecords(string fullName, string fullNameInRomaji)
263276
{
264-
string[] splitRomajiParts = fullNameInRomaji.Split((string[]?)null, StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries);
265-
string? surname = null;
266-
string? surnameInRomaji = null;
267-
string? givenName = null;
268-
string? givenNameInRomaji = null;
269-
if (splitRomajiParts.Length > 1)
277+
if (!Utils.JapaneseRegex.IsMatch(fullName))
278+
{
279+
return null;
280+
}
281+
282+
string[] splitFullNameInRomajiParts = fullNameInRomaji.Split((string[]?)null, StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries);
283+
if (splitFullNameInRomajiParts.Length <= 1)
284+
{
285+
return null;
286+
}
287+
288+
string[] splitFullNameParts = fullName.Split((string[]?)null, StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries);
289+
if (splitFullNameParts.Length is 1)
270290
{
271-
string[] splitFullNameParts = fullName.Split((string[]?)null, StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries);
272-
if (splitFullNameParts.Length is 1)
291+
splitFullNameParts = fullName.Split(['=', '=', '・', '・', '・'], StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries);
292+
}
293+
294+
if (splitFullNameParts.Length <= 1)
295+
{
296+
return null;
297+
}
298+
299+
if (splitFullNameInRomajiParts.Length != splitFullNameParts.Length)
300+
{
301+
return null;
302+
}
303+
304+
string firstFullNamePart = splitFullNameParts[0];
305+
string lastFullNamePart = splitFullNameParts[^1];
306+
bool firstFullNamePartIsKanjiOrHiragana = Utils.KanjiRegex.IsMatch(firstFullNamePart) || Utils.HiraganaRegex.IsMatch(firstFullNamePart);
307+
308+
if ((!splitFullNameParts.Any(Utils.KatakanaRegex.IsMatch) && !splitFullNameParts.Any(Utils.LatinRegex.IsMatch))
309+
|| (firstFullNamePartIsKanjiOrHiragana && (Utils.KanjiRegex.IsMatch(lastFullNamePart) || Utils.HiraganaRegex.IsMatch(lastFullNamePart)))
310+
|| (splitFullNameParts.Length is 2 && firstFullNamePartIsKanjiOrHiragana))
311+
{
312+
List<(NameRecord surnameRecord, NameRecord givenNameRecord)> surnameAndNameRecords = new(splitFullNameParts.Length - 1);
313+
NameRecord surnameRecord = new(splitFullNameParts[0], splitFullNameInRomajiParts[0]);
314+
for (int i = 1; i < splitFullNameParts.Length; i++)
273315
{
274-
splitFullNameParts = fullName.Split(['=', '='], StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries);
316+
surnameAndNameRecords.Add((surnameRecord, new NameRecord(splitFullNameParts[i], splitFullNameInRomajiParts[i])));
275317
}
276318

277-
if (splitFullNameParts.Length > 1)
319+
return surnameAndNameRecords;
320+
}
321+
else //if (!Utils.KanjiRegex.IsMatch(splitFullNameParts[0])
322+
// && (Utils.KatakanaRegex.IsMatch(splitFullNameParts[0])
323+
// || Utils.LatinRegex.IsMatch(splitFullNameParts[0])))
324+
{
325+
NameRecord surnameRecord = new(splitFullNameParts[^1], splitFullNameInRomajiParts[^1]);
326+
bool hasMoreThanTwoNameParts = splitFullNameParts.Length > 2;
327+
int surnameIndex = FindSurnameStartIndex(splitFullNameParts, splitFullNameInRomajiParts);
328+
List<(NameRecord surnameRecord, NameRecord givenNameRecord)> surnameAndNameRecords = new(hasMoreThanTwoNameParts ? surnameIndex : surnameIndex - 1);
329+
for (int i = 0; i < surnameIndex; i++)
278330
{
279-
if (splitRomajiParts.Length == splitFullNameParts.Length)
280-
{
281-
if (splitFullNameParts.Length is 2)
282-
{
283-
surname = splitFullNameParts[0];
284-
surnameInRomaji = splitRomajiParts[0];
285-
givenName = splitFullNameParts[1];
286-
givenNameInRomaji = splitRomajiParts[1];
287-
288-
if (Utils.KatakanaRegex.IsMatch(surname))
289-
{
290-
(surname, givenName) = (givenName, surname);
291-
(surnameInRomaji, givenNameInRomaji) = (givenNameInRomaji, surnameInRomaji);
292-
}
293-
}
294-
}
331+
surnameAndNameRecords.Add((surnameRecord, new NameRecord(splitFullNameParts[i], splitFullNameInRomajiParts[i])));
295332
}
296-
else
333+
334+
if (hasMoreThanTwoNameParts)
297335
{
298-
splitFullNameParts = fullName.Split(['・', '・', '・'], StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries);
299-
if (splitRomajiParts.Length == splitFullNameParts.Length)
336+
int surnameLength = 0;
337+
int surnameInRomajiLength = 0;
338+
for (int i = surnameIndex; i < splitFullNameParts.Length; i++)
300339
{
301-
if (splitFullNameParts.Length is 2)
302-
{
303-
givenName = splitFullNameParts[0];
304-
givenNameInRomaji = splitRomajiParts[0];
305-
surname = splitFullNameParts[1];
306-
surnameInRomaji = splitRomajiParts[1];
307-
308-
if (!Utils.KatakanaRegex.IsMatch(surname) && !Utils.KatakanaRegex.IsMatch(givenName))
309-
{
310-
(givenName, surname) = (surname, givenName);
311-
(givenNameInRomaji, surnameInRomaji) = (surnameInRomaji, givenNameInRomaji);
312-
}
313-
}
340+
surnameLength += splitFullNameParts[i].Length;
341+
surnameInRomajiLength += splitFullNameInRomajiParts[i].Length;
314342
}
343+
344+
int offset = splitFullNameParts.Length - surnameIndex - 1;
345+
int surnameStartIndexForFullName = fullName.Length - surnameLength - offset;
346+
int surnameStartIndexForFullNameInRomaji = fullNameInRomaji.Length - surnameInRomajiLength - offset;
347+
348+
NameRecord fullSurnameRecord = new(string.Join("", fullName[surnameStartIndexForFullName..].Split()), fullNameInRomaji[surnameStartIndexForFullNameInRomaji..]);
349+
NameRecord fullGivenNameRecord = new(string.Join("", fullName[..(surnameStartIndexForFullName - 1)].Split()), fullNameInRomaji[..(surnameStartIndexForFullNameInRomaji - 1)]);
350+
surnameAndNameRecords.Add((fullSurnameRecord, fullGivenNameRecord));
351+
}
352+
353+
return surnameAndNameRecords;
354+
}
355+
}
356+
357+
private static int FindSurnameStartIndex(string[] splitFullNameParts, string[] splitFullNameInRomajiParts)
358+
{
359+
KeyValuePair<string, string[]>[] surnamePrefixes =
360+
[
361+
new("von", ["フォン", "ファン"]),
362+
new("van", ["ヴァン", "ファン"]),
363+
new("de", ["ド", "デ", "ダ"]),
364+
new("du", ["デュ"]),
365+
new("di", ["ディ"]),
366+
new("le", ["ル"]),
367+
new("la", ["ラ"])
368+
];
369+
370+
int index = splitFullNameParts.Length - 1;
371+
foreach (KeyValuePair<string, string[]> surnamePerfix in surnamePrefixes)
372+
{
373+
KeyValuePair<string, string[]> perfix = surnamePerfix;
374+
int tempIndex = Array.FindIndex(splitFullNameInRomajiParts, 1, splitFullNameParts.Length - 2, r => r.Equals(perfix.Key, StringComparison.OrdinalIgnoreCase));
375+
if (tempIndex > 0 && index > tempIndex && surnamePerfix.Value.Contains(splitFullNameParts[tempIndex]))
376+
{
377+
index = tempIndex;
315378
}
316379
}
317380

318-
return givenName is not null && givenNameInRomaji is not null && surname is not null && surnameInRomaji is not null
319-
? (surnameRecord: new NameRecord(surname, surnameInRomaji), givenNameRecord: new NameRecord(givenName, givenNameInRomaji))
320-
: null;
381+
return index;
321382
}
322383
}

VndbCharacterNames/Utils.cs

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,18 +15,24 @@ internal static partial class Utils
1515
Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping
1616
};
1717

18-
[GeneratedRegex(@"^[\u30A0-\u30FF]+$", RegexOptions.CultureInvariant)]
18+
[GeneratedRegex(@"[\u30A0-\u30FF\uFF66-\uFF9D]", RegexOptions.CultureInvariant)]
1919
public static partial Regex KatakanaRegex { get; }
2020

21+
[GeneratedRegex(@"[\u3040-\u309F]", RegexOptions.CultureInvariant)]
22+
public static partial Regex HiraganaRegex { get; }
23+
2124
[GeneratedRegex(@"^Full name:.*\n((Sex:.*\n)|)", RegexOptions.CultureInvariant)]
2225
public static partial Regex FullNameAndSexRegex { get; }
2326

2427
[GeneratedRegex(@"[\u00D7\u2000-\u206F\u25A0-\u25FF\u2E80-\u319F\u31C0-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF\uFE30-\uFE4F\uFF00-\uFFEF]|\uD82C[\uDC00-\uDD6F]|\uD83C[\uDE00-\uDEFF]|\uD840[\uDC00-\uDFFF]|[\uD841-\uD868][\uDC00-\uDFFF]|\uD869[\uDC00-\uDEDF]|\uD869[\uDF00-\uDFFF]|[\uD86A-\uD87A][\uDC00-\uDFFF]|\uD87B[\uDC00-\uDE5F]|\uD87E[\uDC00-\uDE1F]|\uD880[\uDC00-\uDFFF]|[\uD881-\uD887][\uDC00-\uDFFF]|\uD888[\uDC00-\uDFAF]", RegexOptions.CultureInvariant)]
2528
public static partial Regex JapaneseRegex { get; }
2629

27-
[GeneratedRegex(@"[\u0000-\u024F\u1E00-\u1EFF\u2C60-\u2C7F]", RegexOptions.CultureInvariant)]
30+
[GeneratedRegex(@"[\u0000-\u024F\u1E00-\u1EFF\u2C60-\u2C7F\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A]", RegexOptions.CultureInvariant)]
2831
public static partial Regex LatinRegex { get; }
2932

33+
[GeneratedRegex(@"[\u2E80-\u2FDF\u3190-\u319F\u3200-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF\uFE30-\uFE4F]|\uD83C[\uDE00-\uDEFF]|\uD840[\uDC00-\uDFFF]|[\uD841-\uD868][\uDC00-\uDFFF]|\uD869[\uDC00-\uDEDF]|\uD869[\uDF00-\uDFFF]|[\uD86A-\uD87A][\uDC00-\uDFFF]|\uD87B[\uDC00-\uDE5F]|\uD87E[\uDC00-\uDE1F]|\uD880[\uDC00-\uDFFF]|[\uD881-\uD887][\uDC00-\uDFFF]|\uD888[\uDC00-\uDFAF]", RegexOptions.CultureInvariant)]
34+
public static partial Regex KanjiRegex { get; }
35+
3036
[GeneratedRegex(@"(\S+)\s*\(([^,]+?)\)", RegexOptions.CultureInvariant)]
3137
public static partial Regex NameInParentheses { get; }
3238
}

0 commit comments

Comments (0)