-
-
Notifications
You must be signed in to change notification settings - Fork 451
PDF Metadata Support #3108
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
majora2007
merged 10 commits into
Kareadita:bugfix/polish
from
microtherion:Pdf_Metadata
Feb 16, 2025
Merged
PDF Metadata Support #3108
Changes from all commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
b71b591
Parse embedded metadata in PDF files
microtherion 4ecc7df
Add test case (using existing test data), fix one discovered bug
microtherion 057d5fc
Address smaller PR comments
microtherion 1e28d6d
Refactor PDF metadata extraction
microtherion ee315cf
Address second round of PR review comments
microtherion f75a8dc
Rewrite PDF Metadata extraction, add more tests for indirect referenc…
microtherion 8820277
Refactor dictionary parsing
microtherion e4b86c7
Incorporate PR feedback; add references to PDF Spec
microtherion 068d0fb
We do indeed want TitleSort, not Title
microtherion 8b28ad4
Revert PdfParser change
microtherion File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
// Translates PDF metadata (see PdfMetadataExtractor.cs) into a ComicInfo structure. | ||
|
||
// Contributed by https://github.com/microtherion | ||
|
||
// All references to the "PDF Spec" (section numbers, etc) refer to the | ||
// PDF 1.7 Specification a.k.a. PDF32000-1:2008 | ||
// https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf | ||
|
||
using System; | ||
using System.Xml; | ||
using System.Text; | ||
using System.IO; | ||
using System.Diagnostics; | ||
using API.Data.Metadata; | ||
using API.Entities.Enums; | ||
using API.Services; | ||
using API.Services.Tasks.Scanner.Parser; | ||
using Microsoft.Extensions.Logging; | ||
using Nager.ArticleNumber; | ||
using System.Collections.Generic; | ||
|
||
namespace API.Helpers; | ||
#nullable enable | ||
|
||
/// <summary>
/// Produces a <see cref="ComicInfo"/> from the metadata embedded in a PDF file.
/// </summary>
public interface IPdfComicInfoExtractor
{
    /// <summary>
    /// Builds a <see cref="ComicInfo"/> for the PDF located at <paramref name="filePath"/>.
    /// </summary>
    /// <param name="filePath">Path of the PDF file to inspect.</param>
    /// <returns>The populated <see cref="ComicInfo"/>, or <c>null</c> when none could be produced.</returns>
    ComicInfo? GetComicInfo(string filePath);
}
|
||
/// <summary>
/// Translates the key/value metadata produced by <c>PdfMetadataExtractor</c> into a <see cref="ComicInfo"/>.
/// </summary>
public class PdfComicInfoExtractor : IPdfComicInfoExtractor
{
    // PDF metadata is machine-readable, so all parsing/formatting must be independent of the server locale.
    private static readonly System.Globalization.CultureInfo Invariant =
        System.Globalization.CultureInfo.InvariantCulture;

    // PDF Spec 7.9.4. The input is normalised before matching (see GetDateTimeFromText):
    // apostrophes become ':' and 'Z' becomes '+'. Each precision is listed with an offset
    // followed by a trailing separator ("+01'00'"), an offset without it ("+01'00"),
    // a bare '+' (from "Z"), and no time zone at all.
    private static readonly string[] PdfDateFormats = [
        "D:yyyyMMddHHmmsszzz:", "D:yyyyMMddHHmmsszzz", "D:yyyyMMddHHmmss+", "D:yyyyMMddHHmmss",
        "D:yyyyMMddHHmmzzz:", "D:yyyyMMddHHmmzzz", "D:yyyyMMddHHmm+", "D:yyyyMMddHHmm",
        "D:yyyyMMddHHzzz:", "D:yyyyMMddHHzzz", "D:yyyyMMddHH+", "D:yyyyMMddHH",
        "D:yyyyMMdd", "D:yyyyMM", "D:yyyy"
    ];

    private readonly ILogger<BookService> _logger;
    private readonly IMediaErrorService _mediaErrorService;

    /// <summary>
    /// Creates the extractor.
    /// </summary>
    /// <param name="logger">Logger used for diagnostics and handed to the metadata extractor.</param>
    /// <param name="mediaErrorService">Service that metadata parsing failures are reported to.</param>
    public PdfComicInfoExtractor(ILogger<BookService> logger, IMediaErrorService mediaErrorService)
    {
        _logger = logger;
        _mediaErrorService = mediaErrorService;
    }

    /// <summary>
    /// Parses <paramref name="text"/> as a floating point number using the invariant culture.
    /// </summary>
    /// <returns>The parsed value, or <c>null</c> when the text is null, empty or not a number.</returns>
    private static float? GetFloatFromText(string? text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return null;
        }

        // Same number styles as float.TryParse(string, out float), but with a fixed culture so that
        // "1.5" is not misread on servers whose locale uses ',' as the decimal separator.
        const System.Globalization.NumberStyles styles =
            System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;

        return float.TryParse(text, styles, Invariant, out var value) ? value : null;
    }

    /// <summary>
    /// Parses a date taken from PDF metadata.
    /// </summary>
    /// <param name="text">Raw date text; may be null or empty.</param>
    /// <returns>The parsed date, or <c>null</c> when the text is missing or in an unrecognised format.</returns>
    private static DateTime? GetDateTimeFromText(string? text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return null;
        }

        // Dates stored in the XMP metadata stream (PDF Spec 14.3.2)
        // are stored in ISO 8601 format, which is handled by C# out of the box
        if (DateTime.TryParse(text, Invariant, System.Globalization.DateTimeStyles.None, out var date))
        {
            return date;
        }

        // Dates stored in the document information directory (PDF Spec 14.3.3)
        // are stored in a proprietary format (PDF Spec 7.9.4) that needs to be
        // massaged slightly to be expressible by a DateTime format.
        if (!text.StartsWith("D:", StringComparison.Ordinal))
        {
            text = "D:" + text;
        }

        text = text.Replace("'", ":").Replace("Z", "+");

        // The invariant culture matters here: ':' in a custom format string is the
        // culture-specific time separator, not a literal colon.
        if (DateTime.TryParseExact(text, PdfDateFormats, Invariant,
                System.Globalization.DateTimeStyles.None, out var pdfDate))
        {
            return pdfDate;
        }

        return null;
    }

    /// <summary>
    /// Looks up <paramref name="key"/> in <paramref name="metadata"/>.
    /// </summary>
    /// <returns>The stored value, or <c>null</c> when the key is absent.</returns>
    private static string? MaybeGetMetadata(Dictionary<string, string> metadata, string key)
    {
        return metadata.TryGetValue(key, out var value) ? value : null;
    }

    /// <summary>
    /// Maps the extracted PDF metadata entries onto a new <see cref="ComicInfo"/>.
    /// </summary>
    /// <param name="metadata">Metadata entries keyed by name (e.g. "Title", "Author", "CreationDate").</param>
    /// <param name="filePath">Path of the PDF; used for log messages and for volume detection from the file name.</param>
    /// <returns>The populated <see cref="ComicInfo"/>.</returns>
    private ComicInfo? GetComicInfoFromMetadata(Dictionary<string, string> metadata, string filePath)
    {
        var info = new ComicInfo();

        var publicationDate = GetDateTimeFromText(MaybeGetMetadata(metadata, "CreationDate"));

        if (publicationDate is not null)
        {
            info.Year = publicationDate.Value.Year;
            info.Month = publicationDate.Value.Month;
            info.Day = publicationDate.Value.Day;
        }

        info.Summary = MaybeGetMetadata(metadata, "Summary") ?? string.Empty;
        info.Publisher = MaybeGetMetadata(metadata, "Publisher") ?? string.Empty;
        info.Writer = MaybeGetMetadata(metadata, "Author") ?? string.Empty;
        info.Title = MaybeGetMetadata(metadata, "Title") ?? string.Empty;
        info.Genre = MaybeGetMetadata(metadata, "Subject") ?? string.Empty;
        info.LanguageISO = BookService.ValidateLanguage(MaybeGetMetadata(metadata, "Language"));
        info.Isbn = MaybeGetMetadata(metadata, "ISBN") ?? string.Empty;

        if (!string.IsNullOrEmpty(info.Isbn) && !ArticleNumberHelper.IsValidIsbn10(info.Isbn) && !ArticleNumberHelper.IsValidIsbn13(info.Isbn))
        {
            _logger.LogDebug("[BookService] {File} has an invalid ISBN number", filePath);
            info.Isbn = string.Empty;
        }

        info.UserRating = GetFloatFromText(MaybeGetMetadata(metadata, "UserRating")) ?? 0.0f;
        info.TitleSort = MaybeGetMetadata(metadata, "TitleSort") ?? string.Empty;
        info.Series = MaybeGetMetadata(metadata, "Series") ?? info.TitleSort;
        info.SeriesSort = info.Series;

        // Leave Volume empty when the PDF carries no usable volume number. Defaulting to "0"
        // made the single-book check below unreachable and reported a volume that was never declared.
        var volume = GetFloatFromText(MaybeGetMetadata(metadata, "Volume"));
        info.Volume = volume?.ToString(Invariant) ?? string.Empty;

        // If this is a single book and not a collection, set publication status to Completed
        if (string.IsNullOrEmpty(info.Volume) && Parser.ParseVolume(filePath, LibraryType.Manga).Equals(Parser.LooseLeafVolume))
        {
            info.Count = 1;
        }

        // Deriving Series/Volume from the title was removed as probably unneeded per discussion in
        // https://github.com/Kareadita/Kavita/pull/3108#discussion_r1956747782

        ComicInfo.CleanComicInfo(info);

        return info;
    }

    /// <summary>
    /// Extracts the embedded metadata of the PDF at <paramref name="filePath"/> and converts it to a <see cref="ComicInfo"/>.
    /// </summary>
    /// <param name="filePath">Path of the PDF file to inspect.</param>
    /// <returns>
    /// The populated <see cref="ComicInfo"/>, or <c>null</c> when extraction threw; in that case the
    /// failure is logged and reported to the media error service instead of being propagated.
    /// </returns>
    public ComicInfo? GetComicInfo(string filePath)
    {
        try
        {
            var extractor = new PdfMetadataExtractor(_logger, filePath);

            return GetComicInfoFromMetadata(extractor.GetMetadata(), filePath);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "[GetComicInfo] There was an exception parsing PDF metadata for {File}", filePath);
            _mediaErrorService.ReportMediaIssue(filePath, MediaErrorProducer.BookService,
                "There was an exception parsing PDF metadata", ex);
        }

        return null;
    }
}
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.