Skip to content

Commit

Permalink
Add charset_handler option to Mail.Parsers.RFC2822
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtimberlake committed Oct 10, 2024
1 parent def05fc commit f4b3772
Show file tree
Hide file tree
Showing 2 changed files with 143 additions and 79 deletions.
59 changes: 35 additions & 24 deletions lib/mail/parsers/rfc_2822.ex
Original file line number Diff line number Diff line change
Expand Up @@ -38,20 +38,28 @@ defmodule Mail.Parsers.RFC2822 do
"december" => "dec"
}

@spec parse(binary() | nonempty_maybe_improper_list()) :: Mail.Message.t()
def parse(content)
@doc """
Parses an RFC2822 message back into a `%Mail.Message{}` data model.
## Options
* `:charset_handler` - A function that takes a charset name and a decoded binary and returns a binary. It is called for each encoded word found in the headers. Defaults to returning the string as is.
def parse([_ | _] = lines) do
"""
@spec parse(binary() | nonempty_maybe_improper_list(), keyword()) :: Mail.Message.t()
def parse(content, opts \\ [])

def parse([_ | _] = lines, opts) do
[headers, lines] = extract_headers(lines)

%Mail.Message{}
|> parse_headers(headers)
|> mark_multipart
|> parse_body(lines)
|> parse_headers(headers, opts)
|> mark_multipart()
|> parse_body(lines, opts)
end

def parse(content),
do: content |> String.split("\r\n") |> Enum.map(&String.trim_trailing/1) |> parse
def parse(content, opts),
do: content |> String.split("\r\n") |> Enum.map(&String.trim_trailing/1) |> parse(opts)

defp extract_headers(list, headers \\ [])

Expand Down Expand Up @@ -294,18 +302,18 @@ defmodule Mail.Parsers.RFC2822 do
end)
end

defp parse_headers(message, []), do: message
defp parse_headers(message, [], _opts), do: message

defp parse_headers(message, [header | tail]) do
defp parse_headers(message, [header | tail], opts) do
[name, body] = String.split(header, ":", parts: 2)
key = String.downcase(name)
decoded = parse_encoded_word(body)
decoded = parse_encoded_word(body, opts)

headers =
put_header(message.headers, key, String.downcase(name) |> parse_header_value(decoded))

message = %{message | headers: headers}
parse_headers(message, tail)
parse_headers(message, tail, opts)
end

defp put_header(headers, "received" = key, value),
Expand Down Expand Up @@ -365,11 +373,11 @@ defmodule Mail.Parsers.RFC2822 do
do: value

# See https://tools.ietf.org/html/rfc2047
defp parse_encoded_word(""), do: ""
defp parse_encoded_word("", _opts), do: ""

defp parse_encoded_word(<<"=?", value::binary>>) do
defp parse_encoded_word(<<"=?", value::binary>>, opts) do
case String.split(value, "?", parts: 4) do
[_charset, encoding, encoded_string, <<"=", remainder::binary>>] ->
[charset, encoding, encoded_string, <<"=", remainder::binary>>] ->
decoded_string =
case String.upcase(encoding) do
"Q" ->
Expand All @@ -379,19 +387,22 @@ defmodule Mail.Parsers.RFC2822 do
Mail.Encoders.Base64.decode(encoded_string)
end

charset_handler = Keyword.get(opts, :charset_handler, fn _, string -> string end)
decoded_string = charset_handler.(charset, decoded_string)

# Remove space if immediately followed by another encoded word string
remainder = Regex.replace(~r/\s+\=\?/, remainder, "=?")

decoded_string <> parse_encoded_word(remainder)
decoded_string <> parse_encoded_word(remainder, opts)

_ ->
# Not an encoded word, moving on
"=?" <> parse_encoded_word(value)
"=?" <> parse_encoded_word(value, opts)
end
end

defp parse_encoded_word(<<char::utf8, rest::binary>>),
do: <<char::utf8, parse_encoded_word(rest)::binary>>
defp parse_encoded_word(<<char::utf8, rest::binary>>, opts),
do: <<char::utf8, parse_encoded_word(rest, opts)::binary>>

defp parse_structured_header_value(string, value \\ nil, sub_types \\ [], acc \\ "")

Expand Down Expand Up @@ -495,28 +506,28 @@ defmodule Mail.Parsers.RFC2822 do
defp remove_excess_whitespace(<<char::utf8, rest::binary>>),
do: <<char::utf8, remove_excess_whitespace(rest)::binary>>

defp parse_body(%Mail.Message{multipart: true} = message, lines) do
defp parse_body(%Mail.Message{multipart: true} = message, lines, opts) do
content_type = message.headers["content-type"]
boundary = Mail.Proplist.get(content_type, "boundary")

parts =
lines
|> extract_parts(boundary)
|> Enum.map(fn part ->
parse(part)
parse(part, opts)
end)

Map.put(message, :parts, parts)
end

defp parse_body(%Mail.Message{} = message, []) do
defp parse_body(%Mail.Message{} = message, [], _opts) do
message
end

defp parse_body(%Mail.Message{} = message, lines) do
defp parse_body(%Mail.Message{} = message, lines, _opts) do
decoded =
lines
|> join_body
|> join_body()
|> decode(message)

Map.put(message, :body, decoded)
Expand Down
163 changes: 108 additions & 55 deletions test/mail/parsers/rfc_2822_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -776,65 +776,118 @@ defmodule Mail.Parsers.RFC2822Test do
end

test "parses Windows-1252 encoded filenames" do
message =
parse_email("""
To: [email protected]
From: [email protected]
Subject: Test
Content-Type: multipart/mixed;
boundary="----=_Part_295474_20544590.1456382229928"
------=_Part_295474_20544590.1456382229928
Content-Type: text/plain
This is some text
------=_Part_295474_20544590.1456382229928
Content-Type: application/octet-stream;
name="=?Windows-1252?Q?Imagin=E9.pdf?="
Content-Description: =?Windows-1252?Q?Imagine=E9.pdf?=
Content-Disposition: attachment;
filename="=?Windows-1252?Q?Imagine=E9.pdf?="; size=864872;
creation-date="Tue, 08 Oct 2024 14:16:59 GMT";
modification-date="Tue, 08 Oct 2024 14:16:59 GMT"
Content-Transfer-Encoding: base64
JVBERi0xLjcKJeLjz9MKNiAwIG9iago8PCAvQ3JlYXRvciAoT3BlblRleHQgRXhzdHJlYW0gVmVy
------=_Part_295474_20544590.1456382229928
Content-Type: application/pdf;
name="=?windows-1258?Q?Pre=ECsentation.pdf?="
Content-Description: =?windows-1258?Q?Pre=ECsentation.pdf?=
Content-Disposition: attachment;
filename="=?windows-1258?Q?Pre=ECsentation.pdf?="; size=3827236;
creation-date="Wed, 11 Sep 2024 09:27:41 GMT";
modification-date="Wed, 09 Oct 2024 08:27:14 GMT"
Content-ID: <f_m0xno2c63>
Content-Transfer-Encoding: base64
email = """
To: [email protected]
From: [email protected]
Subject: Test
Content-Type: multipart/mixed;
boundary="----=_Part_295474_20544590.1456382229928"
------=_Part_295474_20544590.1456382229928
Content-Type: text/plain
This is some text
------=_Part_295474_20544590.1456382229928
Content-Type: application/octet-stream;
name="=?Windows-1252?Q?Imagin=E9.pdf?="
Content-Description: =?Windows-1252?Q?Imagine=E9.pdf?=
Content-Disposition: attachment;
filename="=?Windows-1252?Q?Imagine=E9.pdf?="; size=864872;
creation-date="Tue, 08 Oct 2024 14:16:59 GMT";
modification-date="Tue, 08 Oct 2024 14:16:59 GMT"
Content-Transfer-Encoding: base64
JVBERi0xLjcKJeLjz9MKNiAwIG9iago8PCAvQ3JlYXRvciAoT3BlblRleHQgRXhzdHJlYW0gVmVy
------=_Part_295474_20544590.1456382229928
Content-Type: application/pdf;
name="=?windows-1258?Q?Pre=ECsentation.pdf?="
Content-Description: =?windows-1258?Q?Pre=ECsentation.pdf?=
Content-Disposition: attachment;
filename="=?windows-1258?Q?Pre=ECsentation.pdf?="; size=3827236;
creation-date="Wed, 11 Sep 2024 09:27:41 GMT";
modification-date="Wed, 09 Oct 2024 08:27:14 GMT"
Content-ID: <f_m0xno2c63>
Content-Transfer-Encoding: base64
JVBERi0xLjcKJeLjz9MKNiAwIG9iago8PCAvQ3JlYXRvciAoT3BlblRleHQgRXhzdHJlYW0gVmVy
------=_Part_295474_20544590.1456382229928
Content-Type: application/octet-stream;
name="=?Windows-1252?Q?ID_S=E9_-_Liste_inscrits.xlsx?="
Content-Description: =?Windows-1252?Q?ID_S=E9_-_Liste_inscrits.xlsx?=
Content-Disposition: attachment;
filename="=?Windows-1252?Q?ID_S=E9_-_Liste_inscrits.xlsx?=";
size=19791; creation-date="Tue, 08 Oct 2024 14:16:55 GMT";
modification-date="Tue, 08 Oct 2024 14:16:55 GMT"
Content-Transfer-Encoding: base64
JVBERi0xLjcKJeLjz9MKNiAwIG9iago8PCAvQ3JlYXRvciAoT3BlblRleHQgRXhzdHJlYW0gVmVy
------=_Part_295474_20544590.1456382229928
"""

message = parse_email(email)
assert [part1, part2, part3, part4] = message.parts

JVBERi0xLjcKJeLjz9MKNiAwIG9iago8PCAvQ3JlYXRvciAoT3BlblRleHQgRXhzdHJlYW0gVmVy
assert %{headers: %{"content-type" => ["text/plain" | _]}} = part1

------=_Part_295474_20544590.1456382229928
Content-Type: application/octet-stream;
name="=?Windows-1252?Q?ID_S=E9_-_Liste_inscrits.xlsx?="
Content-Description: =?Windows-1252?Q?ID_S=E9_-_Liste_inscrits.xlsx?=
Content-Disposition: attachment;
filename="=?Windows-1252?Q?ID_S=E9_-_Liste_inscrits.xlsx?=";
size=19791; creation-date="Tue, 08 Oct 2024 14:16:55 GMT";
modification-date="Tue, 08 Oct 2024 14:16:55 GMT"
Content-Transfer-Encoding: base64
assert %{
headers: %{
"content-type" => ["application/octet-stream", {"name", "Imagin\xE9.pdf"}]
}
} = part2

assert %{headers: %{"content-type" => ["application/pdf", {"name", "Pre\xECsentation.pdf"}]}} =
part3

assert %{
headers: %{
"content-type" => [
"application/octet-stream",
{"name", "ID S\xE9 - Liste inscrits.xlsx"}
]
}
} = part4

# A simple character-replacement function that simulates a charset conversion from Windows-1252/1258 to UTF-8
message =
parse_email(email,
charset_handler: fn _charset, string ->
string
|> String.graphemes()
|> Enum.map(fn
# Windows-1252
<<233>> -> "é"
# Windows-1258
<<236>> -> "\u0301"
char -> char
end)
|> Enum.join()
end
)

JVBERi0xLjcKJeLjz9MKNiAwIG9iago8PCAvQ3JlYXRvciAoT3BlblRleHQgRXhzdHJlYW0gVmVy
assert [part1, part2, part3, part4] = message.parts
assert %{headers: %{"content-type" => ["text/plain" | _]}} = part1

------=_Part_295474_20544590.1456382229928
""")
assert %{
headers: %{
"content-type" => ["application/octet-stream", {"name", "Imaginé.pdf"}]
}
} = part2

assert [part1, part2, part3, part4] = message.parts
assert %{headers: %{"content-type" => ["application/pdf", {"name", "Présentation.pdf"}]}} =
part3

assert %{headers: %{"content-type" => ["text/plain" | _]}} = part1
assert %{headers: %{"content-type" => ["application/octet-stream", {"name", "Imagin\xE9.pdf"}]}} = part2
assert %{headers: %{"content-type" => ["application/pdf", {"name", "Pre\xECsentation.pdf"}]}} = part3
assert %{headers: %{"content-type" => ["application/octet-stream", {"name", "ID S\xE9 - Liste inscrits.xlsx"}]}} = part4
assert %{
headers: %{
"content-type" => [
"application/octet-stream",
{"name", "ID Sé - Liste inscrits.xlsx"}
]
}
} = part4
end

test "content-type mixed with no body" do
Expand Down Expand Up @@ -879,8 +932,8 @@ defmodule Mail.Parsers.RFC2822Test do
assert message.headers["content-type"] == ["text/html", {"charset", "us-ascii"}]
end

defp parse_email(email),
do: email |> convert_crlf |> Mail.Parsers.RFC2822.parse()
defp parse_email(email, opts \\ []),
do: email |> convert_crlf |> Mail.Parsers.RFC2822.parse(opts)

defp parse_recipient(recipient),
do: Mail.Parsers.RFC2822.parse_recipient_value(recipient)
Expand Down

0 comments on commit f4b3772

Please sign in to comment.