Skip to content

Commit

Permalink
Apertium []❡ header marker via -H, currently only for HTML (issue #15)
Browse files Browse the repository at this point in the history
  • Loading branch information
TinoDidriksen committed Jan 25, 2024
1 parent ef432b0 commit 34d9672
Show file tree
Hide file tree
Showing 11 changed files with 38 additions and 9 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION})
project(transfuse
VERSION 0.6.2
VERSION 0.6.3
LANGUAGES CXX C
)

Expand Down
12 changes: 9 additions & 3 deletions src/dom.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ void DOM::save_styles(xmlString& s, xmlNodePtr dom, size_t rn, bool protect) {
}

// Extracts blocks and textual attributes for the stream, and leaves unique markers we can later search/replace
void DOM::extract_blocks(xmlString& s, xmlNodePtr dom, size_t rn, bool txt) {
void DOM::extract_blocks(xmlString& s, xmlNodePtr dom, size_t rn, bool txt, bool header) {
if (dom == nullptr || dom->children == nullptr) {
return;
}
Expand Down Expand Up @@ -528,6 +528,9 @@ void DOM::extract_blocks(xmlString& s, xmlNodePtr dom, size_t rn, bool txt) {

stream->block_open(s, tmp_lxs[2]);
stream->block_body(s, tmp_lxs[1]);
if (attr_headers.count(a)) {
stream->block_term_header(s);
}
stream->block_close(s, tmp_lxs[2]);

tmp_lxs[3] = XC(TFB_OPEN_B);
Expand All @@ -543,10 +546,10 @@ void DOM::extract_blocks(xmlString& s, xmlNodePtr dom, size_t rn, bool txt) {
}

if (tags_parents_allow.count(lname)) {
extract_blocks(s, child, rn + 1, true);
extract_blocks(s, child, rn + 1, true, header || tags_headers.count(lname));
}
else if (child->type == XML_ELEMENT_NODE || child->properties) {
extract_blocks(s, child, rn + 1, txt);
extract_blocks(s, child, rn + 1, txt, header || tags_headers.count(lname));
}
else if (child->content && child->content[0]) {
if (!txt) {
Expand Down Expand Up @@ -579,6 +582,9 @@ void DOM::extract_blocks(xmlString& s, xmlNodePtr dom, size_t rn, bool txt) {

stream->block_open(s, tmp_lxs[2]);
stream->block_body(s, tmp_lxs[1]);
if (header || tags_headers.count(pname)) {
stream->block_term_header(s);
}
stream->block_close(s, tmp_lxs[2]);

tmp_lxs[3] = XC(TFB_OPEN_B);
Expand Down
4 changes: 3 additions & 1 deletion src/dom.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,9 @@ struct DOM {
xmlChars tags_inline; // Inline tags
xmlChars tags_parents_allow; // If set, only extract children of these tags
xmlChars tags_parents_direct; // Used for TTX <df>?
xmlChars tags_headers; // Tags that should append ❡ (U+2761)
xmlChars tag_attrs; // Attributes that should also be extracted
xmlChars attr_headers; // Attributes that should append ❡ (U+2761)

DOM(State&, xmlDocPtr);
~DOM();
Expand Down Expand Up @@ -189,7 +191,7 @@ struct DOM {
return rv;
}

void extract_blocks(xmlString&, xmlNodePtr, size_t, bool txt = false);
void extract_blocks(xmlString&, xmlNodePtr, size_t, bool txt = false, bool header = false);
xmlString extract_blocks() {
xmlString rv;
stream->stream_header(rv, state.tmpdir);
Expand Down
3 changes: 2 additions & 1 deletion src/extract.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@

namespace Transfuse {

fs::path extract(fs::path tmpdir, fs::path infile, std::string_view format, Stream stream, bool wipe) {
fs::path extract(fs::path tmpdir, fs::path infile, std::string_view format, Stream stream, bool wipe, bool mark_headers) {
if (stream == Streams::detect) {
stream = Streams::apertium;
}
Expand Down Expand Up @@ -195,6 +195,7 @@ fs::path extract(fs::path tmpdir, fs::path infile, std::string_view format, Stre

state->format(format);
state->stream(stream);
state->opt_mark_headers = mark_headers;

if (format == "docx") {
dom = extract_docx(*state);
Expand Down
4 changes: 4 additions & 0 deletions src/format-html.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,10 @@ std::unique_ptr<DOM> extract_html(State& state, std::unique_ptr<icu::UnicodeStri
dom->tags_raw = make_xmlChars("script", "style", "svg");
dom->tags_inline = make_xmlChars("a", "abbr", "acronym", "address", "b", "bdi", "bdo", "big", "del", "em", "font", "i", "ins", "kbd", "mark", "meter", "output", "q", "s", "samp", "small", "span", "strike", "strong", "sub", "sup", "time", "tt", "u", "var");
dom->tag_attrs = make_xmlChars("alt", "caption", "label", "summary", "title", "placeholder");
if (state.opt_mark_headers) {
dom->tags_headers = make_xmlChars("h1", "h2", "h3", "h4", "h5", "h6");
dom->attr_headers = make_xmlChars("title");
}
dom->save_spaces();

auto styled = dom->save_styles(true);
Expand Down
2 changes: 2 additions & 0 deletions src/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ constexpr auto XML_ENC_UC = static_cast<UChar>(u'\uE014');
#define TFU_OPEN "\xee\x80\xa2" /* \uE022 */
#define TFU_CLOSE "\xee\x80\xa3" /* \uE023 */

#define TF_CURVED_PARAGRAPH "\xe2\x9d\xa1" /* \u2761 ❡ CURVED STEM PARAGRAPH SIGN ORNAMENT */

#if defined(ARCH_BIG_ENDIAN)
const std::string_view utf16_bom{ "\xfe\xff" };
const auto utf16_native = "UTF-16BE";
Expand Down
1 change: 1 addition & 0 deletions src/state.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ struct State {
fs::path tmpdir;
bool opt_verbose = false;
bool opt_debug = false;
bool opt_mark_headers = false;

State(fs::path, bool ro = false);
~State();
Expand Down
5 changes: 5 additions & 0 deletions src/stream-apertium.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,11 @@ void ApertiumStream::block_body(xmlString& s, xmlChar_view xc) {
escape_body(s, xc);
}

// Appends the header terminator for the Apertium stream to `s`: the literal
// "[]" immediately followed by TF_CURVED_PARAGRAPH (U+2761).
void ApertiumStream::block_term_header(xmlString& s) {
	// Adjacent string literals are joined at compile time, so both pieces go out in one append
	s += "[]" TF_CURVED_PARAGRAPH;
}

void ApertiumStream::block_close(xmlString& s, xmlChar_view) {
s += ".[]\n";
s.push_back('\0');
Expand Down
4 changes: 4 additions & 0 deletions src/stream-visl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,10 @@ void VISLStream::block_body(xmlString& s, xmlChar_view xc) {
escape_body(s, xc);
}

// The VISL stream emits no header terminator: `s` is left untouched.
void VISLStream::block_term_header(xmlString& s) {
	// Reference the argument only to mark it as intentionally unused
	static_cast<void>(s);
}

// Closes the current block in the VISL stream by appending a newline, the
// "</s>" end tag, and two further newlines to `s`. The second argument is
// ignored.
void VISLStream::block_close(xmlString& s, xmlChar_view) {
	static constexpr char block_end[] = "\n</s>\n\n";
	s += block_end;
}
Expand Down
3 changes: 3 additions & 0 deletions src/stream.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ struct StreamBase {
virtual void stream_header(xmlString&, fs::path) = 0;
virtual void block_open(xmlString&, xmlChar_view) = 0;
virtual void block_body(xmlString&, xmlChar_view) = 0;
virtual void block_term_header(xmlString&) = 0;
virtual void block_close(xmlString&, xmlChar_view) = 0;

// Input functions
Expand All @@ -59,6 +60,7 @@ struct ApertiumStream final : StreamBase {
void stream_header(xmlString&, fs::path) final;
void block_open(xmlString&, xmlChar_view) final;
void block_body(xmlString&, xmlChar_view) final;
void block_term_header(xmlString&) final;
void block_close(xmlString&, xmlChar_view) final;

// Input functions
Expand All @@ -77,6 +79,7 @@ struct VISLStream final : StreamBase {
void stream_header(xmlString&, fs::path) final;
void block_open(xmlString&, xmlChar_view) final;
void block_body(xmlString&, xmlChar_view) final;
void block_term_header(xmlString&) final;
void block_close(xmlString&, xmlChar_view) final;

// Input functions
Expand Down
7 changes: 4 additions & 3 deletions src/transfuse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ using namespace icu;

namespace Transfuse {

fs::path extract(fs::path tmpdir, fs::path infile, std::string_view format, Stream stream, bool wipe);
fs::path extract(fs::path tmpdir, fs::path infile, std::string_view format, Stream stream, bool wipe, bool mark_headers = false);
std::pair<fs::path, std::string> inject(fs::path tmpdir, std::istream& in, Stream stream);

std::istream* read_or_stdin(const char* arg, std::unique_ptr<std::istream>& in) {
Expand Down Expand Up @@ -86,6 +86,7 @@ int main(int argc, char* argv[]) {
O('K', "no-keep", ARG_NO, "recreate state folder before extraction and delete it after injection"),
O('i', "input", ARG_REQ, "input file, if not passed as arg; default and - is stdin"),
O('o', "output", ARG_REQ, "output file, if not passed as arg; default and - is stdout"),
O('H', "mark-headers", ARG_NO, "output U+2761 after headers, such as HTML tags h1-h6 and attribute 'title'"),
O('V', "version", ARG_NO, "output version information"),
// Options after final() are still usable, but not shown in --help
final(),
Expand Down Expand Up @@ -220,7 +221,7 @@ int main(int argc, char* argv[]) {

if (mode == "clean") {
// Extracts and immediately injects again - useful for cleaning documents for other CAT tools, such as OmegaT
tmpdir = extract(tmpdir, infile, format, stream, opts["no-keep"] != nullptr);
tmpdir = extract(tmpdir, infile, format, stream, opts["no-keep"] != nullptr, opts["mark-headers"] != nullptr);
in = read_or_stdin("extracted", _in);
auto rv = inject(tmpdir, *in, stream);
std::ifstream data(rv.second, std::ios::binary);
Expand All @@ -230,7 +231,7 @@ int main(int argc, char* argv[]) {
tmpdir = rv.first;
}
else if (mode == "extract") {
tmpdir = extract(tmpdir, infile, format, stream, opts["no-keep"] != nullptr);
tmpdir = extract(tmpdir, infile, format, stream, opts["no-keep"] != nullptr, opts["mark-headers"] != nullptr);
std::ifstream data("extracted", std::ios::binary);
data.exceptions(std::ios::badbit | std::ios::failbit);
(*out) << data.rdbuf();
Expand Down

0 comments on commit 34d9672

Please sign in to comment.