-
Notifications
You must be signed in to change notification settings - Fork 210
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
XHTML content is parsed out of order? #275
Comments
Hey @talonx! I'm here to help you with bug fixes, questions, and contributions. Let's tackle this issue together while we wait for a human maintainer. To resolve the issue of your custom RSS parser outputting jumbled and out-of-order XHTML content, you should ensure that the parser processes the text nodes correctly and maintains the order of elements. Here are a few steps you can take:
Here is a code snippet that highlights how to handle text nodes and maintain the order of elements: function closeText(parser) {
parser.textNode = textopts(parser.opt, parser.textNode);
if (parser.textNode) emit(parser, 'ontext', parser.textNode);
parser.textNode = '';
}
function emitNode(parser, nodeType, data) {
if (parser.textNode) closeText(parser);
emit(parser, nodeType, data);
}
function processAsync() {
var chunk, err;
try {
if (this.remaining.length <= this.options.chunkSize) {
chunk = this.remaining;
this.remaining = '';
this.saxParser = this.saxParser.write(chunk);
return this.saxParser.close();
} else {
chunk = this.remaining.substr(0, this.options.chunkSize);
this.remaining = this.remaining.substr(this.options.chunkSize, this.remaining.length);
this.saxParser = this.saxParser.write(chunk);
return setImmediate(this.processAsync.bind(this));
}
} catch (error1) {
err = error1;
if (!this.saxParser.errThrown) {
this.saxParser.errThrown = true;
return this.emit(err);
}
}
}
Parser.prototype.reset = function () {
var attrkey, charkey, ontext, stack;
this.removeAllListeners();
this.saxParser = sax.parser(this.options.strict, {
trim: false,
normalize: false,
xmlns: this.options.xmlns
});
this.saxParser.errThrown = false;
this.saxParser.onerror = function (_this) {
return function (error) {
_this.saxParser.resume();
if (!_this.saxParser.errThrown) {
_this.saxParser.errThrown = true;
return _this.emit("error", error);
}
};
}(this);
this.saxParser.onend = function (_this) {
return function () {
if (!_this.saxParser.ended) {
_this.saxParser.ended = true;
return _this.emit("end", _this.resultObject);
}
};
}(this);
this.saxParser.ended = false;
this.EXPLICIT_CHARKEY = this.options.explicitCharkey;
this.resultObject = null;
stack = [];
attrkey = this.options.attrkey;
charkey = this.options.charkey;
this.saxParser.onopentag = function (_this) {
return function (node) {
var key, newValue, obj, processedKey, ref;
obj = Object.create(null);
obj[charkey] = "";
if (!_this.options.ignoreAttrs) {
ref = node.attributes;
for (key in ref) {
if (!hasProp.call(ref, key)) continue;
if (!(attrkey in obj) && !_this.options.mergeAttrs) {
obj[attrkey] = Object.create(null);
}
newValue = _this.options.attrValueProcessors ? processItem(_this.options.attrValueProcessors, node.attributes[key], key) : node.attributes[key];
processedKey = _this.options.attrNameProcessors ? processItem(_this.options.attrNameProcessors, key) : key;
if (_this.options.mergeAttrs) {
_this.assignOrPush(obj, processedKey, newValue);
} else {
obj[attrkey][processedKey] = newValue;
}
}
}
obj["#name"] = _this.options.tagNameProcessors ? processItem(_this.options.tagNameProcessors, node.name) : node.name;
if (_this.options.xmlns) {
obj[_this.options.xmlnskey] = {
uri: node.uri,
local: node.local
};
}
return stack.push(obj);
};
}(this);
this.saxParser.onclosetag = function (_this) {
return function () {
var cdata, emptyStr, key, node, nodeName, obj, objClone, old, s, xpath;
obj = stack.pop();
nodeName = obj["#name"];
if (!_this.options.explicitChildren || !_this.options.preserveChildrenOrder) {
delete obj["#name"];
}
if (obj.cdata === true) {
cdata = obj.cdata;
delete obj.cdata;
}
s = stack[stack.length - 1];
if (obj[charkey].match(/^\s*$/) && !cdata) {
emptyStr = obj[charkey];
delete obj[charkey];
} else {
if (_this.options.trim) {
obj[charkey] = obj[charkey].trim();
}
if (_this.options.normalize) {
obj[charkey] = obj[charkey].replace(/\s{2,}/g, " ").trim();
}
obj[charkey] = _this.options.valueProcessors ? processItem(_this.options.valueProcessors, obj[charkey], nodeName) : obj[charkey];
if (Object.keys(obj).length === 1 && charkey in obj && !_this.EXPLICIT_CHARKEY) {
obj = obj[charkey];
}
}
if (isEmpty(obj)) {
if (typeof _this.options.emptyTag === 'function') {
obj = _this.options.emptyTag();
} else {
obj = _this.options.emptyTag !== '' ? _this.options.emptyTag : emptyStr;
}
}
if (_this.options.validator != null) {
xpath = "/" + function () {
var i, len, results;
results = [];
for (i = 0, len = stack.length; i < len; i++) {
results.push(stack[i]["#name"]);
}
return results.join("/");
}();
obj = _this.options.validator(xpath, s, obj);
}
if (s) {
_this.assignOrPush(s, nodeName, obj);
} else {
_this.resultObject = obj;
_this.saxParser.ended = true;
return _this.emit("end", _this.resultObject);
}
};
}(this);
ontext = function (_this) {
return function (text) {
var charChild, s;
s = stack[stack.length - 1];
if (s) {
s[charkey] += text;
if (_this.options.explicitChildren && _this.options.preserveChildrenOrder && _this.options.charsAsChildren && (_this.options.includeWhiteChars || text.replace(/\n/g, '').trim() !== '')) {
s[_this.options.childkey] = s[_this.options.childkey] || [];
charChild = {
'#name': '__text__'
};
charChild[charkey] = text;
if (_this.options.normalize) {
charChild[charkey] = charChild[charkey].replace(/\s{2,}/g, " ").trim();
}
s[_this.options.childkey].push(charChild);
}
return s;
}
};
}(this);
this.saxParser.ontext = ontext;
return this.saxParser.oncdata = function (_this) {
return function (text) {
var s;
s = ontext(text);
if (s) {
return s.cdata = true;
}
};
}(this);
}; This code ensures that text nodes are closed properly and that the order of elements is maintained during parsing. Adjust the |
hetzner.txt
After parsing when I attempt to access the content using
the content is mixed up like this
Notice the last two "in_progress"es squashed together, and the dates also.
The actual content looks like this (see attached file)
My parser is defined with a custom field as
The text was updated successfully, but these errors were encountered: