Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse embedded metadata in PDF files #3108

Open
wants to merge 5 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions API.Tests/Services/BookServiceTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,4 +81,23 @@ public void ShouldParseAsVolumeGroup_WithSeriesIndex()
Assert.Equal("Accel World", comicInfo.Series);
}

/// <summary>
/// Verifies that <c>GetComicInfo</c> returns a populated result for the bundled <c>test.pdf</c>
/// and that the title and writer match the expected values.
/// </summary>
[Fact]
public void ShouldHaveComicInfoForPDF()
{
    var testDirectory = Path.Join(Directory.GetCurrentDirectory(), "../../../Services/Test Data/BookService");
    var document = Path.Join(testDirectory, "test.pdf");

    var comicInfo = _bookService.GetComicInfo(document);

    Assert.NotNull(comicInfo);
    Assert.Equal("Variations Chromatiques de concert", comicInfo.Title);
    // NOTE(review): the expected writer contains literal backslashes before the parentheses --
    // presumably that is how the value is stored in test.pdf; confirm against the fixture.
    Assert.Equal("Georges Bizet \\(1838-1875\\)", comicInfo.Writer);
}

/// <summary>
/// Verifies that <c>GetComicInfo</c> returns null for the "Rollo at Work SP01.pdf" fixture.
/// </summary>
[Fact]
public void MissingPDFMetadata()
{
    var fixtureFolder = Path.Join(Directory.GetCurrentDirectory(), "../../../Services/Test Data/ScannerService");
    var pdfPath = Path.Join(fixtureFolder, "Rollo at Work SP01.pdf");

    Assert.Null(_bookService.GetComicInfo(pdfPath));
}
}
226 changes: 226 additions & 0 deletions API/Helpers/PdfMetadataExtractor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
using System;
using System.Globalization;
using System.IO;
using System.Text;
using System.Xml;
using API.Data.Metadata;
using API.Entities.Enums;
using API.Services;
using API.Services.Tasks.Scanner.Parser;
using Microsoft.Extensions.Logging;
using Nager.ArticleNumber;

namespace API.Helpers;
#nullable enable

/// <summary>
/// Contract for producing a <see cref="ComicInfo"/> from a PDF file on disk.
/// </summary>
public interface IPdfMetadataExtractor
{
/// <summary>
/// Builds a <see cref="ComicInfo"/> for the given file.
/// </summary>
/// <param name="filePath">Path of the PDF file to inspect.</param>
/// <returns>The extracted information, or null (the signature allows it) when none is available.</returns>
ComicInfo? GetComicInfo(string filePath);
}

/// <summary>
/// Locates the XMP packet (<c>&lt;x:xmpmeta ...&gt; ... &lt;/x:xmpmeta&gt;</c>) embedded in a PDF file by scanning
/// the raw bytes, parses it as XML and maps selected properties onto a <see cref="ComicInfo"/>.
/// </summary>
public class PdfMetadataExtractor : IPdfMetadataExtractor
{
    /// <summary>Number of bytes requested from the file per read.</summary>
    private const int ChunkSize = 4096;

    /// <summary>
    /// Number of trailing bytes carried over between reads while looking for the start tag, so a tag
    /// that straddles a chunk boundary is still found. Must be at least the start tag length minus one.
    /// </summary>
    private const int Overlap = 16;

    // No closing bracket: there generally are attributes
    private static readonly byte[] StartTag = Encoding.UTF8.GetBytes("<x:xmpmeta");
    private static readonly byte[] EndTag = Encoding.UTF8.GetBytes("</x:xmpmeta>");

    private readonly ILogger<BookService> _logger;
    private readonly IMediaErrorService _mediaErrorService;

    public PdfMetadataExtractor(ILogger<BookService> logger, IMediaErrorService mediaErrorService)
    {
        _logger = logger;
        _mediaErrorService = mediaErrorService;
    }

    /// <summary>
    /// Finds the first occurrence of <paramref name="match"/> within the first <paramref name="bufLen"/>
    /// bytes of <paramref name="buffer"/>, optionally starting the search at <paramref name="start"/>.
    /// </summary>
    /// <returns>The absolute index of the match within <paramref name="buffer"/>, or -1 when absent.</returns>
    private static int FindInBuffer(byte[] buffer, int bufLen, byte[] match, int start = 0)
    {
        var relative = buffer.AsSpan(start, bufLen - start).IndexOf(new ReadOnlySpan<byte>(match));
        return relative < 0 ? -1 : start + relative;
    }

    /// <summary>
    /// Returns the inner text of the first <c>rdf:li</c> descendant of <paramref name="path"/>, falling back
    /// to the inner text of the node at <paramref name="path"/> itself; null when neither exists.
    /// </summary>
    private static string? GetTextFromXmlNode(XmlDocument doc, XmlNamespaceManager ns, string path)
    {
        var root = doc.DocumentElement;
        var node = root?.SelectSingleNode(path + "//rdf:li", ns) ?? root?.SelectSingleNode(path, ns);
        return node?.InnerText;
    }

    /// <summary>
    /// Reads the text at <paramref name="path"/> as a float using the invariant culture.
    /// Returns null when the node is missing, blank, or not a valid number, so that one malformed
    /// optional field does not discard the rest of the metadata.
    /// </summary>
    private static float? GetFloatFromXmlNode(XmlDocument doc, XmlNamespaceManager ns, string path)
    {
        var text = GetTextFromXmlNode(doc, ns, path);
        if (string.IsNullOrWhiteSpace(text)) return null;

        // Same number styles float.Parse(string) uses, but independent of the server's locale.
        return float.TryParse(text, NumberStyles.Float | NumberStyles.AllowThousands,
            CultureInfo.InvariantCulture, out var value)
            ? value
            : (float?) null;
    }

    /// <summary>
    /// Joins the inner text of every <c>rdf:li</c> descendant of <paramref name="path"/> with commas.
    /// Returns null when there are no such nodes or the joined text is empty.
    /// </summary>
    private static string? GetListFromXmlNode(XmlDocument doc, XmlNamespaceManager ns, string path)
    {
        var nodes = doc.DocumentElement?.SelectNodes(path + "//rdf:li", ns);
        if (nodes == null) return null;

        var list = new StringBuilder();

        foreach (XmlNode n in nodes)
        {
            if (list.Length > 0)
            {
                list.Append(',');
            }
            list.Append(n.InnerText);
        }

        return list.Length > 0 ? list.ToString() : null;
    }

    /// <summary>
    /// Reads the text at <paramref name="path"/> as a date using the invariant culture.
    /// Returns null when the node is missing, blank, or cannot be parsed.
    /// </summary>
    private static DateTime? GetDateTimeFromXmlNode(XmlDocument doc, XmlNamespaceManager ns, string path)
    {
        var text = GetTextFromXmlNode(doc, ns, path);
        if (string.IsNullOrWhiteSpace(text)) return null;

        return DateTime.TryParse(text, CultureInfo.InvariantCulture, DateTimeStyles.None, out var parsed)
            ? parsed
            : (DateTime?) null;
    }

    /// <summary>
    /// Parses an XMP packet and maps Dublin Core, Calibre, pdfx, PRISM and XMP basic properties onto a
    /// new <see cref="ComicInfo"/>.
    /// </summary>
    /// <param name="metadata">The XMP packet as XML text.</param>
    /// <param name="filePath">Path of the source file; used for logging and for volume parsing.</param>
    /// <exception cref="XmlException">Thrown by <see cref="XmlDocument.LoadXml"/> when the packet is not well-formed.</exception>
    private ComicInfo? GetComicInfoFromMetadata(string metadata, string filePath)
    {
        // The packet comes from an arbitrary file: never resolve external entities.
        var metaDoc = new XmlDocument { XmlResolver = null };
        metaDoc.LoadXml(metadata);

        var ns = new XmlNamespaceManager(metaDoc.NameTable);
        ns.AddNamespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
        ns.AddNamespace("dc", "http://purl.org/dc/elements/1.1/");
        ns.AddNamespace("calibreSI", "http://calibre-ebook.com/xmp-namespace-series-index");
        ns.AddNamespace("calibre", "http://calibre-ebook.com/xmp-namespace");
        ns.AddNamespace("pdfx", "http://ns.adobe.com/pdfx/1.3/");
        ns.AddNamespace("prism", "http://prismstandard.org/namespaces/basic/2.0/");
        ns.AddNamespace("xmp", "http://ns.adobe.com/xap/1.0/");

        var info = new ComicInfo();

        // XPath names are case-sensitive; the XMP basic schema spells the property "CreateDate".
        // The lowercase spelling is kept as a last resort for backward compatibility.
        var publicationDate = GetDateTimeFromXmlNode(metaDoc, ns, "//dc:date")
                              ?? GetDateTimeFromXmlNode(metaDoc, ns, "//xmp:CreateDate")
                              ?? GetDateTimeFromXmlNode(metaDoc, ns, "//xmp:createdate");

        if (publicationDate != null)
        {
            info.Year = publicationDate.Value.Year;
            info.Month = publicationDate.Value.Month;
            info.Day = publicationDate.Value.Day;
        }

        info.Summary = GetTextFromXmlNode(metaDoc, ns, "//dc:description") ?? String.Empty;
        info.Publisher = GetTextFromXmlNode(metaDoc, ns, "//dc:publisher") ?? String.Empty;
        info.Writer = GetListFromXmlNode(metaDoc, ns, "//dc:creator") ?? String.Empty;
        info.Title = GetTextFromXmlNode(metaDoc, ns, "//dc:title") ?? String.Empty;
        info.Genre = GetListFromXmlNode(metaDoc, ns, "//dc:subject") ?? String.Empty;
        info.LanguageISO = BookService.ValidateLanguage(GetTextFromXmlNode(metaDoc, ns, "//dc:language"));

        var isbn = GetTextFromXmlNode(metaDoc, ns, "//pdfx:isbn")
                   ?? GetTextFromXmlNode(metaDoc, ns, "//prism:isbn")
                   ?? String.Empty;

        // Only validate (and log) when an ISBN is actually present; a missing ISBN is not an invalid one.
        if (isbn.Length > 0 && !ArticleNumberHelper.IsValidIsbn10(isbn) && !ArticleNumberHelper.IsValidIsbn13(isbn))
        {
            _logger.LogDebug("[BookService] {File} has an invalid ISBN number", filePath);
            isbn = String.Empty;
        }

        info.Isbn = isbn;

        info.UserRating = GetFloatFromXmlNode(metaDoc, ns, "//calibre:rating") ?? 0.0f;
        info.TitleSort = GetTextFromXmlNode(metaDoc, ns, "//calibre:title_sort") ?? String.Empty;
        info.Series = GetTextFromXmlNode(metaDoc, ns, "//calibre:series/rdf:value") ?? String.Empty;
        info.SeriesSort = info.Series;

        // Leave Volume empty when no series index is present. Defaulting it to "0" made the two
        // string.IsNullOrEmpty(info.Volume) checks below unreachable.
        var seriesIndex = GetFloatFromXmlNode(metaDoc, ns, "//calibreSI:series_index");
        info.Volume = seriesIndex.HasValue
            ? Convert.ToInt32(seriesIndex.Value).ToString(CultureInfo.InvariantCulture)
            : String.Empty;

        // If this is a single book and not a collection, set publication status to Completed
        if (string.IsNullOrEmpty(info.Volume) && Parser.ParseVolume(filePath, LibraryType.Manga).Equals(Parser.LooseLeafVolume))
        {
            info.Count = 1;
        }

        var hasVolumeInSeries = !Parser.ParseVolume(info.Title, LibraryType.Manga)
            .Equals(Parser.LooseLeafVolume);

        if (string.IsNullOrEmpty(info.Volume) && hasVolumeInSeries && (!info.Series.Equals(info.Title) || string.IsNullOrEmpty(info.Series)))
        {
            // This is likely a light novel for which we can set series from parsed title
            info.Series = Parser.ParseSeries(info.Title, LibraryType.Manga);
            info.Volume = Parser.ParseVolume(info.Title, LibraryType.Manga);
        }

        ComicInfo.CleanComicInfo(info);

        return info;
    }

    /// <summary>
    /// Scans <paramref name="filePath"/> for an embedded XMP packet and converts it to a <see cref="ComicInfo"/>.
    /// </summary>
    /// <param name="filePath">Path of the PDF file to read.</param>
    /// <returns>
    /// The parsed metadata, or null when the file has no start tag, has no matching end tag, or an
    /// exception occurs (in which case it is logged and reported to the media error service).
    /// </returns>
    public ComicInfo? GetComicInfo(string filePath)
    {
        try
        {
            // Dispose the handle on every exit path, including exceptions.
            using var stream = File.OpenRead(filePath);
            var buffer = new byte[ChunkSize + Overlap];
            var bytesAvailable = 0;
            int start;

            // Phase 1: read chunk by chunk until the start tag shows up.
            while (true)
            {
                var bytesRead = stream.Read(buffer, bytesAvailable, ChunkSize);
                if (bytesRead == 0) return null; // No metadata at all

                bytesAvailable += bytesRead;
                start = FindInBuffer(buffer, bytesAvailable, StartTag);
                if (start >= 0) break;

                // Keep the tail of this chunk at the front of the buffer so a tag split across
                // two reads is still matched on the next pass.
                var ovl = Math.Min(Overlap, bytesAvailable);
                Buffer.BlockCopy(buffer, bytesAvailable - ovl, buffer, 0, ovl);
                bytesAvailable = ovl;
            }

            // Phase 2: accumulate everything from the start tag onwards until the end tag appears.
            using var meta = new MemoryStream();
            meta.Write(buffer, start, bytesAvailable - start);
            var searchFrom = 0;

            while (true)
            {
                var data = meta.GetBuffer();
                var length = (int) meta.Length;
                var found = FindInBuffer(data, length, EndTag, searchFrom);

                if (found >= 0)
                {
                    var metadataEnd = found + EndTag.Length;
                    return GetComicInfoFromMetadata(Encoding.UTF8.GetString(data, 0, metadataEnd), filePath);
                }

                // Only the newly appended bytes need scanning next time, plus enough of the old
                // tail to catch an end tag that straddles the boundary.
                searchFrom = Math.Max(0, length - (EndTag.Length - 1));

                var bytesRead = stream.Read(buffer, 0, ChunkSize);
                if (bytesRead == 0) return null; // Missing end tag

                meta.Write(buffer, 0, bytesRead);
            }
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "[GetComicInfo] There was an exception parsing PDF metadata");
            _mediaErrorService.ReportMediaIssue(filePath, MediaErrorProducer.BookService,
                "There was an exception parsing PDF metadata", ex);
        }

        return null;
    }
}
27 changes: 22 additions & 5 deletions API/Services/BookService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Xml;
using API.Data.Metadata;
using API.DTOs.Reader;
using API.Entities;
using API.Entities.Enums;
using API.Extensions;
using API.Services.Tasks.Scanner.Parser;
using API.Helpers;
using Docnet.Core;
using Docnet.Core.Converters;
using Docnet.Core.Models;
Expand Down Expand Up @@ -69,6 +71,8 @@ public class BookService : IBookService
private static readonly RecyclableMemoryStreamManager StreamManager = new ();
private const string CssScopeClass = ".book-content";
private const string BookApiUrl = "book-resources?file=";
private readonly PdfMetadataExtractor _pdfMetadataExtractor;

public static readonly EpubReaderOptions BookReaderOptions = new()
{
PackageReaderOptions = new PackageReaderOptions
Expand All @@ -84,6 +88,7 @@ public BookService(ILogger<BookService> logger, IDirectoryService directoryServi
_directoryService = directoryService;
_imageService = imageService;
_mediaErrorService = mediaErrorService;
_pdfMetadataExtractor = new PdfMetadataExtractor(_logger, _mediaErrorService);
}

private static bool HasClickableHrefPart(HtmlNode anchor)
Expand Down Expand Up @@ -425,10 +430,8 @@ await _mediaErrorService.ReportMediaIssueAsync(book.FilePath, MediaErrorProducer
}
}

public ComicInfo? GetComicInfo(string filePath)
private ComicInfo? GetEpubComicInfo(string filePath)
{
if (!IsValidFile(filePath) || Parser.IsPdf(filePath)) return null;

try
{
using var epubBook = EpubReader.OpenBook(filePath, BookReaderOptions);
Expand All @@ -442,7 +445,7 @@ await _mediaErrorService.ReportMediaIssueAsync(book.FilePath, MediaErrorProducer
var (year, month, day) = GetPublicationDate(publicationDate);

var summary = epubBook.Schema.Package.Metadata.Descriptions.FirstOrDefault();
var info = new ComicInfo
var info = new ComicInfo
{
Summary = string.IsNullOrEmpty(summary?.Description) ? string.Empty : summary.Description,
Publisher = string.Join(",", epubBook.Schema.Package.Metadata.Publishers.Select(p => p.Publisher)),
Expand Down Expand Up @@ -583,6 +586,20 @@ await _mediaErrorService.ReportMediaIssueAsync(book.FilePath, MediaErrorProducer
return null;
}

/// <summary>
/// Returns metadata for a book file: PDF files are delegated to the PDF metadata extractor,
/// everything else goes through <c>GetEpubComicInfo</c>.
/// </summary>
/// <param name="filePath">Path of the file to inspect.</param>
/// <returns>The extracted <see cref="ComicInfo"/>, or null when <c>IsValidFile</c> rejects the path or the delegate returns null.</returns>
public ComicInfo? GetComicInfo(string filePath)
{
    if (!IsValidFile(filePath)) return null;

    return Parser.IsPdf(filePath)
        ? _pdfMetadataExtractor.GetComicInfo(filePath)
        : GetEpubComicInfo(filePath);
}

private static void ExtractSortTitle(EpubMetadataMeta metadataItem, EpubBookRef epubBook, ComicInfo info)
{
var titleId = metadataItem.Refines?.Replace("#", string.Empty);
Expand Down Expand Up @@ -685,7 +702,7 @@ private static (int year, int month, int day) GetPublicationDate(string? publica
return (year, month, day);
}

private static string ValidateLanguage(string? language)
public static string ValidateLanguage(string? language)
{
if (string.IsNullOrEmpty(language)) return string.Empty;

Expand Down
2 changes: 1 addition & 1 deletion API/Services/ReadingItemService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public ReadingItemService(IArchiveService archiveService, IBookService bookServi
/// <returns></returns>
public ComicInfo? GetComicInfo(string filePath)
{
if (Parser.IsEpub(filePath))
if (Parser.IsEpub(filePath) || Parser.IsPdf(filePath))
{
return _bookService.GetComicInfo(filePath);
}
Expand Down
3 changes: 3 additions & 0 deletions API/Services/Tasks/Scanner/Parser/PdfParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ public override ParserInfo Parse(string filePath, string rootPath, string librar
ParseFromFallbackFolders(filePath, rootPath, type, ref ret);
}

// Patch in other information from ComicInfo
UpdateFromComicInfo(ret);

if (ret.Chapters == Parser.DefaultChapter && ret.Volumes == Parser.LooseLeafVolume && type == LibraryType.Book)
{
ret.IsSpecial = true;
Expand Down