-
-
Notifications
You must be signed in to change notification settings - Fork 918
Open
Labels
Milestone
Description
Please describe the bug
The encoding of an HTML5 document differs from the encoding of an HTML4 document:
Nokogiri::HTML4::Document.parse(File.open(SHIFT_JIS_HTML)).encoding == "Shift_JIS"
Nokogiri::HTML5::Document.parse(File.open(SHIFT_JIS_HTML)).encoding == "UTF-8"
I haven't had time to dig into why this is (and whether it's intended behavior), so I'm opening this issue to look into it later. cc @stevecheckoway
Help us reproduce what you're seeing
#! /usr/bin/env ruby
$: << "lib"
require 'nokogiri'
require_relative 'test/helper'
# Reproduction spec for this report: checks the encoding reported by
# Nokogiri::HTML4::Document and Nokogiri::HTML5::Document after parsing the
# SHIFT_JIS_HTML fixture, for four input forms (File, File plus an explicit
# encoding, String, String plus an explicit encoding).
#
# NOTE(review): SHIFT_JIS_HTML and Nokogiri::TestCase are not defined in this
# script; presumably they are provided by test/helper -- confirm.
class Test < Nokogiri::TestCase
  describe "document encoding" do
    describe "HTML4" do
      describe "given a File" do
        it "should detect shift_jis" do
          document = Nokogiri::HTML4::Document.parse(File.open(SHIFT_JIS_HTML))
          assert_equal("Shift_JIS", document.encoding)
        end
      end

      describe "given a File and an encoding" do
        it "should detect shift_jis" do
          # The encoding is passed explicitly as the third argument.
          document = Nokogiri::HTML4::Document.parse(File.open(SHIFT_JIS_HTML), nil, "Shift_JIS")
          assert_equal("Shift_JIS", document.encoding)
        end
      end

      describe "given a String" do
        it "should detect shift_jis" do
          # fails (reporter's annotation; the output captured in this report
          # only lists the HTML5 cases, so this one is not shown there)
          document = Nokogiri::HTML4::Document.parse(File.read(SHIFT_JIS_HTML, encoding: "Shift_JIS"))
          assert_equal("Shift_JIS", document.encoding)
        end
      end

      describe "given a String and an encoding" do
        it "should detect shift_jis" do
          # The file is read without an explicit encoding; the encoding is
          # only given to the parser.
          document = Nokogiri::HTML4::Document.parse(File.read(SHIFT_JIS_HTML), nil, "Shift_JIS")
          assert_equal("Shift_JIS", document.encoding)
        end
      end
    end

    describe "HTML5" do
      describe "given a File" do
        it "should detect shift_jis" do
          # fails: the captured output reports "UTF-8" instead of "Shift_JIS"
          document = Nokogiri::HTML5::Document.parse(File.open(SHIFT_JIS_HTML))
          assert_equal("Shift_JIS", document.encoding)
        end
      end

      describe "given a File and an encoding" do
        it "should detect shift_jis" do
          # errors: the captured output reports
          # "TypeError: no implicit conversion of Hash into Integer",
          # raised from `read` inside `read_and_encode`
          document = Nokogiri::HTML5::Document.parse(File.open(SHIFT_JIS_HTML), nil, "Shift_JIS")
          assert_equal("Shift_JIS", document.encoding)
        end
      end

      describe "given a String" do
        it "should detect shift_jis" do
          # fails: the captured output reports "UTF-8" instead of "Shift_JIS"
          document = Nokogiri::HTML5::Document.parse(File.read(SHIFT_JIS_HTML, encoding: "Shift_JIS"))
          assert_equal("Shift_JIS", document.encoding)
        end
      end

      describe "given a String and an encoding" do
        it "should detect shift_jis" do
          # fails: the captured output reports "UTF-8" instead of "Shift_JIS"
          document = Nokogiri::HTML5::Document.parse(File.read(SHIFT_JIS_HTML), nil, "Shift_JIS")
          assert_equal("Shift_JIS", document.encoding)
        end
      end
    end
  end
end
Running this script yields:
Error:
document encoding::HTML5::given a File and an encoding#test_0001_should detect shift_jis:
TypeError: no implicit conversion of Hash into Integer
/home/flavorjones/code/oss/nokogiri/lib/nokogiri/html5.rb:266:in `read'
/home/flavorjones/code/oss/nokogiri/lib/nokogiri/html5.rb:266:in `read_and_encode'
/home/flavorjones/code/oss/nokogiri/lib/nokogiri/html5/document.rb:119:in `do_parse'
/home/flavorjones/code/oss/nokogiri/lib/nokogiri/html5/document.rb:95:in `parse'
./html5-document-encoding.rb:64:in `block (4 levels) in <class:Test>'
Failure:
document encoding::HTML5::given a File#test_0001_should detect shift_jis [./html5-document-encoding.rb:52]
Minitest::Assertion: Expected: "Shift_JIS"
Actual: "UTF-8"
Failure:
document encoding::HTML5::given a String and an encoding#test_0001_should detect shift_jis [./html5-document-encoding.rb:82]
Minitest::Assertion: Expected: "Shift_JIS"
Actual: "UTF-8"
Failure:
document encoding::HTML5::given a String#test_0001_should detect shift_jis [./html5-document-encoding.rb:72]
Minitest::Assertion: Expected: "Shift_JIS"
Actual: "UTF-8"
Expected behavior
I think the HTML4 and HTML5 parsers should both report the same document encoding for the same input?