Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Limit which text codecs are supported. #2896

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 50 additions & 3 deletions httpx/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,52 @@
r"|".join([re.escape(c) for c in _HTML5_FORM_ENCODING_REPLACEMENTS.keys()])
)

# For our supported text codecs, we start with the text codecs as supported by Chromium, Oct. 2023.
# https://chromium.googlesource.com/chromium/chromium/+/refs/heads/trunk/chrome/browser/character_encoding.cc#36
#
# Then limit them to only include codecs which are documented as included by CPython.
# https://docs.python.org/3/library/codecs.html#standard-encodings
#
# We're referencing them with the canonical name as used by the Python codecs.
# The alias given in the chromium source is included as a comment for comparison.
Comment on lines +32 to +33
Copy link
Member Author

@tomchristie tomchristie Nov 3, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're referencing these character sets with the canonical name as used by the Python codecs.

  • Will the canonical name for the codec be consistent across different Python implementations?
  • Should we instead just be including all the possible aliases explicitly?
  • What set of aliases does chromium support for these charset names?

# Allow-list of text codecs, keyed by the canonical Python codec name.
# The trailing comment on each entry is the alias used in the Chromium source.
SUPPORTED_CODECS = {
    # Unicode
    "utf-8",  # utf-8
    "utf-16-le",  # utf-16le
    # Windows code pages
    "cp1250",  # windows-1250
    "cp1251",  # windows-1251
    "cp1252",  # windows-1252
    "cp1253",  # windows-1253
    "cp1254",  # windows-1254
    "cp1255",  # windows-1255
    "cp1256",  # windows-1256
    "cp1257",  # windows-1257
    "cp1258",  # windows-1258
    # ISO 8859 series
    "iso8859-1",  # iso-8859-1
    "iso8859-2",  # iso-8859-2
    "iso8859-3",  # iso-8859-3
    "iso8859-4",  # iso-8859-4
    "iso8859-5",  # iso-8859-5
    "iso8859-6",  # iso-8859-6
    "iso8859-7",  # iso-8859-7
    "iso8859-8",  # iso-8859-8
    "iso8859-10",  # iso-8859-10
    "iso8859-13",  # iso-8859-13
    "iso8859-14",  # iso-8859-14
    "iso8859-15",  # iso-8859-15
    "iso8859-16",  # iso-8859-16
    # KOI8
    "koi8-r",  # koi8-r
    "koi8-u",  # koi8-u
    # Macintosh
    "mac-roman",  # macintosh
    # CJK encodings
    "big5",  # big5
    "big5hkscs",  # big5-hkscs
    "gb18030",  # gb18030
    "gbk",  # gbk
    "euc_jp",  # euc-jp
    "iso2022_jp",  # iso-2022-jp
    "shift_jis",  # shift-jis
    "euc_kr",  # euc-kr
}


def normalize_header_key(
value: typing.Union[str, bytes],
Expand Down Expand Up @@ -70,13 +116,14 @@ def primitive_value_to_str(value: "PrimitiveData") -> str:

def is_known_encoding(encoding: str) -> bool:
    """
    Return `True` if `encoding` is a supported text codec.

    The name is resolved with `codecs.lookup()`, and the resolved codec's
    `name` attribute is then checked against `SUPPORTED_CODECS`. A name for
    which the lookup raises `LookupError` is reported as unsupported rather
    than propagating the error.
    """
    try:
        codec = codecs.lookup(encoding)
    except LookupError:
        # No codec could be found for this name at all.
        return False

    # The codec exists, but it is only accepted if it is on the allow-list.
    return codec.name in SUPPORTED_CODECS


def format_form_param(name: str, value: str) -> bytes:
Expand Down