diff --git a/API.Tests/Services/BookServiceTests.cs b/API.Tests/Services/BookServiceTests.cs
index e4647524e..a056aae4c 100644
--- a/API.Tests/Services/BookServiceTests.cs
+++ b/API.Tests/Services/BookServiceTests.cs
@@ -81,4 +81,23 @@ public void ShouldParseAsVolumeGroup_WithSeriesIndex()
         Assert.Equal("Accel World", comicInfo.Series);
     }
+    [Fact]
+    public void ShouldHaveComicInfoForPDF()
+    {
+        // The BookService fixture "test.pdf" is expected to expose both a title and a writer
+        var pdfPath = Path.Join(Directory.GetCurrentDirectory(), "../../../Services/Test Data/BookService", "test.pdf");
+        var parsed = _bookService.GetComicInfo(pdfPath);
+        Assert.NotNull(parsed);
+        Assert.Equal("Variations Chromatiques de concert", parsed.Title);
+        Assert.Equal("Georges Bizet \\(1838-1875\\)", parsed.Writer);
+    }
+
+    [Fact]
+    public void MissingPDFMetadata()
+    {
+        // For this ScannerService fixture GetComicInfo is expected to return nothing at all
+        var pdfPath = Path.Join(Directory.GetCurrentDirectory(), "../../../Services/Test Data/ScannerService", "Rollo at Work SP01.pdf");
+        var parsed = _bookService.GetComicInfo(pdfPath);
+        Assert.Null(parsed);
+    }
 }
diff --git a/API/Helpers/PdfMetadataExtractor.cs b/API/Helpers/PdfMetadataExtractor.cs
new file mode 100644
index 000000000..658b0e6b2
--- /dev/null
+++ b/API/Helpers/PdfMetadataExtractor.cs
@@ -0,0 +1,226 @@
+using System;
+using System.Xml;
+using System.Text;
+using System.IO;
+using API.Data.Metadata;
+using API.Entities.Enums;
+using API.Services;
+using API.Services.Tasks.Scanner.Parser;
+using Microsoft.Extensions.Logging;
+using Nager.ArticleNumber;
+
+namespace API.Helpers;
+#nullable enable
+
+public interface IPdfMetadataExtractor
+{
+    ComicInfo? 
GetComicInfo(string filePath);
+}
+
+/// <summary>
+/// Builds a <see cref="ComicInfo"/> from the XML metadata packet embedded in a PDF file.
+/// </summary>
+public class PdfMetadataExtractor : IPdfMetadataExtractor
+{
+    private readonly ILogger _logger;
+    private readonly IMediaErrorService _mediaErrorService;
+
+    public PdfMetadataExtractor(ILogger logger, IMediaErrorService mediaErrorService)
+    {
+        _logger = logger;
+        _mediaErrorService = mediaErrorService;
+    }
+
+    /// <summary>
+    /// Finds the first occurrence of <paramref name="match"/> within the first
+    /// <paramref name="bufLen"/> bytes of <paramref name="buffer"/>.
+    /// </summary>
+    /// <returns>Zero-based offset of the first occurrence, or -1 when there is none.</returns>
+    private int FindInBuffer(byte[] buffer, int bufLen, byte[] match)
+    {
+        // Last offset at which the whole pattern still fits inside the valid part of the buffer
+        var maxPos = bufLen - match.Length;
+        for (var pos = 0; pos <= maxPos; ++pos)
+        {
+            var found = true;
+            for (var ch = 0; ch < match.Length; ++ch)
+            {
+                if (buffer[pos+ch] != match[ch])
+                {
+                    found = false;
+                    break;
+                }
+            }
+            if (found)
+            {
+                return pos;
+            }
+        }
+        return -1;
+    }
+
+    /// <summary>
+    /// Returns the inner text of the first <c>rdf:li</c> descendant of <paramref name="path"/>, falling back to the
+    /// node at <paramref name="path"/> itself; null when neither exists.
+    /// </summary>
+    private string? GetTextFromXmlNode(XmlDocument doc, XmlNamespaceManager ns, string path)
+    {
+        return (doc.DocumentElement?.SelectSingleNode(path + "//rdf:li", ns)
+                ?? doc.DocumentElement?.SelectSingleNode(path, ns))?.InnerText;
+    }
+
+    /// <summary>
+    /// Reads the text at <paramref name="path"/> as a float.
+    /// </summary>
+    /// <returns>The parsed value, or null when the node is missing, blank, or not a number.</returns>
+    private float? GetFloatFromXmlNode(XmlDocument doc, XmlNamespaceManager ns, string path)
+    {
+        var text = GetTextFromXmlNode(doc, ns, path);
+        if (string.IsNullOrWhiteSpace(text)) return null;
+
+        // Embedded metadata is machine-written, so parse it with the invariant culture: "1.5" must mean the same on
+        // every host. A malformed value is treated as absent instead of throwing, because an exception here is
+        // caught by GetComicInfo and would discard every other field of the file as well.
+        return float.TryParse(text, System.Globalization.NumberStyles.Float,
+            System.Globalization.CultureInfo.InvariantCulture, out var value)
+            ? value
+            : (float?)null;
+    }
+
+    /// <summary>
+    /// Joins the inner text of every <c>rdf:li</c> descendant of <paramref name="path"/> with commas.
+    /// </summary>
+    /// <returns>The comma-separated list, or null when there are no entries or all of them are empty.</returns>
+    private string? GetListFromXmlNode(XmlDocument doc, XmlNamespaceManager ns, string path)
+    {
+        var nodes = doc.DocumentElement?.SelectNodes(path+"//rdf:li", ns);
+        if (nodes == null) return null;
+
+        var list = new StringBuilder();
+
+        foreach (XmlNode n in nodes)
+        {
+            if (list.Length > 0)
+            {
+                list.Append(",");
+            }
+            list.Append(n.InnerText);
+        }
+
+        return list.Length > 0 ? list.ToString() : null;
+    }
+
+    /// <summary>
+    /// Reads the text at <paramref name="path"/> as a date.
+    /// </summary>
+    /// <returns>The parsed date, or null when the node is missing, blank, or not a recognisable date.</returns>
+    private DateTime? GetDateTimeFromXmlNode(XmlDocument doc, XmlNamespaceManager ns, string path)
+    {
+        var text = GetTextFromXmlNode(doc, ns, path);
+        if (string.IsNullOrWhiteSpace(text)) return null;
+
+        // Same reasoning as for floats: culture-independent parsing, and a bad date only costs the date itself.
+        return DateTime.TryParse(text, System.Globalization.CultureInfo.InvariantCulture,
+            System.Globalization.DateTimeStyles.None, out var parsed)
+            ? parsed
+            : (DateTime?)null;
+    }
+
+    private ComicInfo? 
GetComicInfoFromMetadata(string metadata, string filePath)
+    {
+        // Maps the XML metadata packet (Dublin Core, XMP, calibre, pdfx and PRISM namespaces) onto a ComicInfo.
+        // filePath is only used for logging and for parsing a volume out of the file name.
+
+        // The packet comes straight out of a user-supplied file: refuse DTDs and never resolve external resources.
+        var readerSettings = new XmlReaderSettings { DtdProcessing = DtdProcessing.Prohibit, XmlResolver = null };
+        var metaDoc = new XmlDocument { XmlResolver = null };
+        using (var reader = XmlReader.Create(new StringReader(metadata), readerSettings))
+        {
+            metaDoc.Load(reader);
+        }
+
+        var ns = new XmlNamespaceManager(metaDoc.NameTable);
+        ns.AddNamespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
+        ns.AddNamespace("dc", "http://purl.org/dc/elements/1.1/");
+        ns.AddNamespace("calibreSI", "http://calibre-ebook.com/xmp-namespace-series-index");
+        ns.AddNamespace("calibre", "http://calibre-ebook.com/xmp-namespace");
+        ns.AddNamespace("pdfx", "http://ns.adobe.com/pdfx/1.3/");
+        ns.AddNamespace("prism", "http://prismstandard.org/namespaces/basic/2.0/");
+        ns.AddNamespace("xmp", "http://ns.adobe.com/xap/1.0/");
+
+        var info = new ComicInfo();
+
+        // XPath names are case-sensitive and the standard XMP property is xmp:CreateDate; the lower-case
+        // spelling is still tried afterwards so files that matched before keep matching.
+        var publicationDate = GetDateTimeFromXmlNode(metaDoc, ns, "//dc:date")
+                              ?? GetDateTimeFromXmlNode(metaDoc, ns, "//xmp:CreateDate")
+                              ?? GetDateTimeFromXmlNode(metaDoc, ns, "//xmp:createdate");
+
+        if (publicationDate != null)
+        {
+            info.Year = publicationDate.Value.Year;
+            info.Month = publicationDate.Value.Month;
+            info.Day = publicationDate.Value.Day;
+        }
+
+        info.Summary = GetTextFromXmlNode(metaDoc, ns, "//dc:description") ?? String.Empty;
+        info.Publisher = GetTextFromXmlNode(metaDoc, ns, "//dc:publisher") ?? String.Empty;
+        info.Writer = GetListFromXmlNode(metaDoc, ns, "//dc:creator") ?? String.Empty;
+        info.Title = GetTextFromXmlNode(metaDoc, ns, "//dc:title") ?? String.Empty;
+        info.Genre = GetListFromXmlNode(metaDoc, ns, "//dc:subject") ?? String.Empty;
+        info.LanguageISO = BookService.ValidateLanguage(GetTextFromXmlNode(metaDoc, ns, "//dc:language"));
+
+        info.Isbn = GetTextFromXmlNode(metaDoc, ns, "//pdfx:isbn") ?? GetTextFromXmlNode(metaDoc, ns, "//prism:isbn") ?? String.Empty;
+
+        // Only a value that is actually present can be invalid; a missing ISBN is not worth a log line per file.
+        if (!string.IsNullOrEmpty(info.Isbn)
+            && !ArticleNumberHelper.IsValidIsbn10(info.Isbn) && !ArticleNumberHelper.IsValidIsbn13(info.Isbn))
+        {
+            _logger.LogDebug("[BookService] {File} has an invalid ISBN number", filePath);
+            info.Isbn = String.Empty;
+        }
+
+        info.UserRating = GetFloatFromXmlNode(metaDoc, ns, "//calibre:rating") ?? 0.0f;
+        info.TitleSort = GetTextFromXmlNode(metaDoc, ns, "//calibre:title_sort") ?? String.Empty;
+        info.Series = GetTextFromXmlNode(metaDoc, ns, "//calibre:series/rdf:value") ?? String.Empty;
+        info.SeriesSort = info.Series;
+
+        // Leave Volume empty when the packet carries no series index. Defaulting the index to 0 produced the
+        // string "0", which made both IsNullOrEmpty(info.Volume) checks below unreachable and reported every
+        // PDF without a series index as volume 0.
+        var seriesIndex = GetFloatFromXmlNode(metaDoc, ns, "//calibreSI:series_index");
+        info.Volume = seriesIndex.HasValue
+            ? Convert.ToInt32(seriesIndex.Value).ToString(System.Globalization.CultureInfo.InvariantCulture)
+            : String.Empty;
+
+        // If this is a single book and not a collection (no volume in metadata or file name), record a count of 1
+        if (string.IsNullOrEmpty(info.Volume) && Parser.ParseVolume(filePath, LibraryType.Manga).Equals(Parser.LooseLeafVolume))
+        {
+            info.Count = 1;
+        }
+
+        var hasVolumeInSeries = !Parser.ParseVolume(info.Title, LibraryType.Manga)
+            .Equals(Parser.LooseLeafVolume);
+
+        if (string.IsNullOrEmpty(info.Volume) && hasVolumeInSeries && (!info.Series.Equals(info.Title) || string.IsNullOrEmpty(info.Series)))
+        {
+            // This is likely a light novel for which we can set series from parsed title
+            info.Series = Parser.ParseSeries(info.Title, LibraryType.Manga);
+            info.Volume = Parser.ParseVolume(info.Title, LibraryType.Manga);
+        }
+
+        ComicInfo.CleanComicInfo(info);
+
+        return info;
+    }
+
+    public ComicInfo? 
GetComicInfo(string filePath) + { + try + { + const int chunkSize = 4096; + const int overlap = 16; + const string startTag = "= 0) + { + meta = buffer[found..bytesAvailable]; + hasMetadata = true; + break; + } + + var ovl = Math.Min(overlap, bytesAvailable); + Buffer.BlockCopy(buffer, bytesAvailable - ovl, buffer, 0, ovl); + bytesAvailable = ovl; + } + + while ((bytesAvailable = stream.Read(buffer, 0, chunkSize)) > 0 || hasMetadata) + { + hasMetadata = false; + + if (bytesAvailable > 0) { + byte[] newMeta = new byte[meta.Length + bytesAvailable]; + Buffer.BlockCopy(meta, 0, newMeta, 0, meta.Length); + Buffer.BlockCopy(buffer, 0, newMeta, meta.Length, bytesAvailable); + meta = newMeta; + } + + var found = FindInBuffer(meta, meta.Length, Encoding.UTF8.GetBytes(endTag)); + + if (found >= 0) + { + var metadataEnd = found + endTag.Length; + var metadata = meta[0..metadataEnd]; + return GetComicInfoFromMetadata(Encoding.UTF8.GetString(metadata), filePath); + } + } + return null; // No metadata at all, or missing end tag + } + catch (Exception ex) + { + _logger.LogWarning(ex, "[GetComicInfo] There was an exception parsing PDF metadata"); + _mediaErrorService.ReportMediaIssue(filePath, MediaErrorProducer.BookService, + "There was an exception parsing PDF metadata", ex); + } + + return null; + } +} \ No newline at end of file diff --git a/API/Services/BookService.cs b/API/Services/BookService.cs index e4ed92047..42687b631 100644 --- a/API/Services/BookService.cs +++ b/API/Services/BookService.cs @@ -6,12 +6,14 @@ using System.Text; using System.Text.RegularExpressions; using System.Threading.Tasks; +using System.Xml; using API.Data.Metadata; using API.DTOs.Reader; using API.Entities; using API.Entities.Enums; using API.Extensions; using API.Services.Tasks.Scanner.Parser; +using API.Helpers; using Docnet.Core; using Docnet.Core.Converters; using Docnet.Core.Models; @@ -69,6 +71,8 @@ public class BookService : IBookService private static readonly 
RecyclableMemoryStreamManager StreamManager = new (); private const string CssScopeClass = ".book-content"; private const string BookApiUrl = "book-resources?file="; + private readonly PdfMetadataExtractor _pdfMetadataExtractor; + public static readonly EpubReaderOptions BookReaderOptions = new() { PackageReaderOptions = new PackageReaderOptions @@ -84,6 +88,7 @@ public BookService(ILogger logger, IDirectoryService directoryServi _directoryService = directoryService; _imageService = imageService; _mediaErrorService = mediaErrorService; + _pdfMetadataExtractor = new PdfMetadataExtractor(_logger, _mediaErrorService); } private static bool HasClickableHrefPart(HtmlNode anchor) @@ -425,10 +430,8 @@ await _mediaErrorService.ReportMediaIssueAsync(book.FilePath, MediaErrorProducer } } - public ComicInfo? GetComicInfo(string filePath) + private ComicInfo? GetEpubComicInfo(string filePath) { - if (!IsValidFile(filePath) || Parser.IsPdf(filePath)) return null; - try { using var epubBook = EpubReader.OpenBook(filePath, BookReaderOptions); @@ -442,7 +445,7 @@ await _mediaErrorService.ReportMediaIssueAsync(book.FilePath, MediaErrorProducer var (year, month, day) = GetPublicationDate(publicationDate); var summary = epubBook.Schema.Package.Metadata.Descriptions.FirstOrDefault(); - var info = new ComicInfo + var info = new ComicInfo { Summary = string.IsNullOrEmpty(summary?.Description) ? string.Empty : summary.Description, Publisher = string.Join(",", epubBook.Schema.Package.Metadata.Publishers.Select(p => p.Publisher)), @@ -583,6 +586,20 @@ await _mediaErrorService.ReportMediaIssueAsync(book.FilePath, MediaErrorProducer return null; } + public ComicInfo? 
GetComicInfo(string filePath)
+    {
+        // Files rejected by IsValidFile never produce metadata
+        if (!IsValidFile(filePath))
+        {
+            return null;
+        }
+
+        // PDFs are delegated to the PDF metadata extractor; everything else takes the EPUB path
+        return Parser.IsPdf(filePath)
+            ? _pdfMetadataExtractor.GetComicInfo(filePath)
+            : GetEpubComicInfo(filePath);
+    }
+
 private static void ExtractSortTitle(EpubMetadataMeta metadataItem, EpubBookRef epubBook, ComicInfo info) { var titleId = metadataItem.Refines?.Replace("#", string.Empty); @@ -685,7 +702,7 @@ private static (int year, int month, int day) GetPublicationDate(string? publica return (year, month, day); } - private static string ValidateLanguage(string? language) + public static string ValidateLanguage(string? language) { if (string.IsNullOrEmpty(language)) return string.Empty; diff --git a/API/Services/ReadingItemService.cs b/API/Services/ReadingItemService.cs index 34360efa5..f4608626d 100644 --- a/API/Services/ReadingItemService.cs +++ b/API/Services/ReadingItemService.cs @@ -53,7 +53,7 @@ public ReadingItemService(IArchiveService archiveService, IBookService bookServi /// public ComicInfo? GetComicInfo(string filePath) { - if (Parser.IsEpub(filePath)) + if (Parser.IsEpub(filePath) || Parser.IsPdf(filePath)) { return _bookService.GetComicInfo(filePath); } diff --git a/API/Services/Tasks/Scanner/Parser/PdfParser.cs b/API/Services/Tasks/Scanner/Parser/PdfParser.cs index d589a9914..d4dd0f6ea 100644 --- a/API/Services/Tasks/Scanner/Parser/PdfParser.cs +++ b/API/Services/Tasks/Scanner/Parser/PdfParser.cs @@ -62,6 +62,9 @@ public override ParserInfo Parse(string filePath, string rootPath, string librar ParseFromFallbackFolders(filePath, rootPath, type, ref ret); } + // Patch in other information from ComicInfo + UpdateFromComicInfo(ret); + if (ret.Chapters == Parser.DefaultChapter && ret.Volumes == Parser.LooseLeafVolume && type == LibraryType.Book) { ret.IsSpecial = true;