From 7f03c39553d62225204747d7c75ebd93aaca5c75 Mon Sep 17 00:00:00 2001 From: Peter van der Weerd Date: Mon, 13 Apr 2020 15:28:54 +0200 Subject: [PATCH] fix for TIKA-3089 contributed by pvanderweerd Wrapping text in pre-tags instead of p-tags will preserve formatting much better --- .../java/org/apache/tika/parser/csv/TextAndCSVParser.java | 5 ++--- .../src/main/java/org/apache/tika/parser/txt/TXTParser.java | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java index 36ed1227ae..bebece5f0a 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java @@ -231,14 +231,14 @@ private void handleText(Reader reader, Charset charset, private static void handleText(Reader reader, XHTMLContentHandler xhtml) throws SAXException, IOException { - xhtml.startElement("p"); + xhtml.startElement("pre"); char[] buffer = new char[4096]; int n = reader.read(buffer); while (n != -1) { xhtml.characters(buffer, 0, n); n = reader.read(buffer); } - xhtml.endElement("p"); + xhtml.endElement("pre"); } @@ -306,7 +306,6 @@ private CSVParams getOverride(Metadata metadata) { String delimiterString = mediaType.getParameters().get(DELIMITER); if (delimiterString == null) { - return new CSVParams(mediaType, charset); } if (STRING_TO_CHAR_DELIMITER_MAP.containsKey(delimiterString)) { return new CSVParams(mediaType, charset, (char) STRING_TO_CHAR_DELIMITER_MAP.get(delimiterString)); diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java index 15425d5840..03a9154af9 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java @@ 
-97,14 +97,14 @@ public void parse( new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); - xhtml.startElement("p"); + xhtml.startElement("pre"); char[] buffer = new char[4096]; int n = reader.read(buffer); while (n != -1) { xhtml.characters(buffer, 0, n); n = reader.read(buffer); } - xhtml.endElement("p"); + xhtml.endElement("pre"); xhtml.endDocument(); }