Skip to content

[bug] HTML5 document encoding differs from HTML4 #2801

@flavorjones

Description

@flavorjones

Please describe the bug

The encoding of an HTML5 document differs from the encoding of an HTML4 document:

Nokogiri::HTML4::Document.parse(File.open(SHIFT_JIS_HTML)).encoding == "Shift_JIS"
Nokogiri::HTML5::Document.parse(File.open(SHIFT_JIS_HTML)).encoding == "UTF-8"

I haven't had time to dig into why this is (and whether it's intended behavior), so I'm opening this issue to look into it later. cc @stevecheckoway

Help us reproduce what you're seeing

#! /usr/bin/env ruby

$: << "lib"
require 'nokogiri'
require_relative 'test/helper'

class Test < Nokogiri::TestCase # reproduction: compares Document#encoding for HTML4 vs HTML5 over four input kinds
  describe "document encoding" do # File inputs use the block form of File.open so handles close even if parse raises
    describe "HTML4" do
      describe "given a File" do
        it "should detect shift_jis" do
          assert_equal(
            "Shift_JIS",
            File.open(SHIFT_JIS_HTML) { |file| Nokogiri::HTML4::Document.parse(file).encoding },
          )
        end
      end

      describe "given a File and an encoding" do
        it "should detect shift_jis" do
          assert_equal(
            "Shift_JIS",
            File.open(SHIFT_JIS_HTML) { |file| Nokogiri::HTML4::Document.parse(file, nil, "Shift_JIS").encoding },
          )
        end
      end

      describe "given a String" do
        it "should detect shift_jis" do
          # fails (per the original note; no matching failure appears in the captured output -- TODO confirm)
          assert_equal(
            "Shift_JIS",
            Nokogiri::HTML4::Document.parse(File.read(SHIFT_JIS_HTML, encoding: "Shift_JIS")).encoding,
          )
        end
      end

      describe "given a String and an encoding" do
        it "should detect shift_jis" do
          assert_equal(
            "Shift_JIS",
            Nokogiri::HTML4::Document.parse(File.read(SHIFT_JIS_HTML), nil, "Shift_JIS").encoding,
          )
        end
      end
    end

    describe "HTML5" do
      describe "given a File" do
        it "should detect shift_jis" do
          # fails: the HTML5 document reports "UTF-8" instead of "Shift_JIS"
          assert_equal(
            "Shift_JIS",
            File.open(SHIFT_JIS_HTML) { |file| Nokogiri::HTML5::Document.parse(file).encoding },
          )
        end
      end

      describe "given a File and an encoding" do
        it "should detect shift_jis" do
          # errors: TypeError (no implicit conversion of Hash into Integer) raised from read_and_encode
          assert_equal(
            "Shift_JIS",
            File.open(SHIFT_JIS_HTML) { |file| Nokogiri::HTML5::Document.parse(file, nil, "Shift_JIS").encoding },
          )
        end
      end

      describe "given a String" do
        it "should detect shift_jis" do
          # fails: the HTML5 document reports "UTF-8" instead of "Shift_JIS"
          assert_equal(
            "Shift_JIS",
            Nokogiri::HTML5::Document.parse(File.read(SHIFT_JIS_HTML, encoding: "Shift_JIS")).encoding,
          )
        end
      end

      describe "given a String and an encoding" do
        it "should detect shift_jis" do
          # fails: the HTML5 document reports "UTF-8" instead of "Shift_JIS"
          assert_equal(
            "Shift_JIS",
            Nokogiri::HTML5::Document.parse(File.read(SHIFT_JIS_HTML), nil, "Shift_JIS").encoding,
          )
        end
      end
    end
  end
end

yields

Error:
document encoding::HTML5::given a File and an encoding#test_0001_should detect shift_jis:
TypeError: no implicit conversion of Hash into Integer
    /home/flavorjones/code/oss/nokogiri/lib/nokogiri/html5.rb:266:in `read'
    /home/flavorjones/code/oss/nokogiri/lib/nokogiri/html5.rb:266:in `read_and_encode'
    /home/flavorjones/code/oss/nokogiri/lib/nokogiri/html5/document.rb:119:in `do_parse'
    /home/flavorjones/code/oss/nokogiri/lib/nokogiri/html5/document.rb:95:in `parse'
    ./html5-document-encoding.rb:64:in `block (4 levels) in <class:Test>'

Failure:
document encoding::HTML5::given a File#test_0001_should detect shift_jis [./html5-document-encoding.rb:52]
Minitest::Assertion: Expected: "Shift_JIS"
  Actual: "UTF-8"

Failure:
document encoding::HTML5::given a String and an encoding#test_0001_should detect shift_jis [./html5-document-encoding.rb:82]
Minitest::Assertion: Expected: "Shift_JIS"
  Actual: "UTF-8"

Failure:
document encoding::HTML5::given a String#test_0001_should detect shift_jis [./html5-document-encoding.rb:72]
Minitest::Assertion: Expected: "Shift_JIS"
  Actual: "UTF-8"

Expected behavior

I think HTML4 and HTML5 documents parsed from the same input should report the same encoding?

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions