diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/email_validator | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-master.tar.gz |
Diffstat (limited to '.venv/lib/python3.12/site-packages/email_validator')
9 files changed, 1454 insertions, 0 deletions
if TYPE_CHECKING:
    from .deliverability import caching_resolver
else:
    def caching_resolver(*args, **kwargs):
        """Forward to `deliverability.caching_resolver`, importing it on first use.

        The `deliverability` module is slow to import (it pulls in
        dns.resolver), so the import is deferred until a resolver is
        actually requested. All arguments are passed through unchanged.
        """
        from . import deliverability as _deliverability

        return _deliverability.caching_resolver(*args, **kwargs)
Although RFC 6761 says + # that application software should not treat these domains as special, + # they are private-use domains and so cannot have globally deliverable + # email addresses, which is an assumption of this library, and probably + # all of arpa is similarly special-use, so we reject it all. + "arpa", + + # RFC 6761 says applications "SHOULD NOT" treat the "example" domains + # as special, i.e. applications should accept these domains. + # + # The domain "example" alone fails our syntax validation because it + # lacks a dot (we assume no one has an email address on a TLD directly). + # "@example.com/net/org" will currently fail DNS-based deliverability + # checks because IANA publishes a NULL MX for these domains, and + # "@mail.example[.com/net/org]" and other subdomains will fail DNS- + # based deliverability checks because IANA does not publish MX or A + # DNS records for these subdomains. + # "example", # i.e. "wwww.example" + # "example.com", + # "example.net", + # "example.org", + + # RFC 6761 says that applications are permitted to treat this domain + # as special and that DNS should return an immediate negative response, + # so we also immediately reject this domain, which also follows the + # purpose of the domain. + "invalid", + + # RFC 6762 says that applications "may" treat ".local" as special and + # that "name resolution APIs and libraries SHOULD recognize these names + # as special," and since ".local" has no global definition, we reject + # it, as we expect email addresses to be gloally routable. + "local", + + # RFC 6761 says that applications (like this library) are permitted + # to treat "localhost" as special, and since it cannot have a globally + # deliverable email address, we reject it. + "localhost", + + # RFC 7686 says "applications that do not implement the Tor protocol + # SHOULD generate an error upon the use of .onion and SHOULD NOT + # perform a DNS lookup. 
def _env_flag(value: str) -> bool:
    # Interpret the text of an environment variable as an on/off switch.
    # An empty value and the usual "off" spellings are False; anything
    # else is True.
    return value.strip().lower() not in ("", "0", "false", "no", "off")


def main(dns_resolver: Optional[_Resolver] = None) -> None:
    """Validate email addresses given on the command line or on STDIN.

    With no command-line argument, addresses are read from STDIN one per
    line and only validation errors are printed. With an argument, that
    single address is validated and either its details (as JSON) or the
    validation error is printed.

    Keyword arguments for validate_email are taken from environment
    variables of the same name in uppercase. Boolean options accept
    "", "0", "false", "no" and "off" (case-insensitive) as False and any
    other value as True.

    The dns_resolver argument is for tests.
    """

    # Set options from environment variables.
    options: Dict[str, Any] = {}
    for varname in ('ALLOW_SMTPUTF8', 'ALLOW_QUOTED_LOCAL', 'ALLOW_DOMAIN_LITERAL',
                    'GLOBALLY_DELIVERABLE', 'CHECK_DELIVERABILITY', 'TEST_ENVIRONMENT'):
        if varname in os.environ:
            # bool() on a string is True for any non-empty text, which made
            # it impossible to turn an option off with "0" or "false".
            options[varname.lower()] = _env_flag(os.environ[varname])
    # NOTE(review): this produces the key "default_timeout"; confirm that
    # validate_email accepts a keyword of that name.
    for varname in ('DEFAULT_TIMEOUT',):
        if varname in os.environ:
            options[varname.lower()] = float(os.environ[varname])

    if len(sys.argv) == 1:
        # Validate the email addresses passed line-by-line on STDIN.
        # A single caching resolver is shared across all lines.
        dns_resolver = dns_resolver or caching_resolver()
        for line in sys.stdin:
            email = line.strip()
            try:
                validate_email(email, dns_resolver=dns_resolver, **options)
            except EmailNotValidError as e:
                print(f"{email} {e}")
    else:
        # Validate the email address passed on the command line.
        email = sys.argv[1]
        try:
            result = validate_email(email, dns_resolver=dns_resolver, **options)
            print(json.dumps(result.as_dict(), indent=2, sort_keys=True, ensure_ascii=False))
        except EmailNotValidError as e:
            print(e)
# Shape of the dict returned by validate_email_deliverability. Every key is
# optional (total=False). The functional TypedDict form is used because
# "unknown-deliverability" is not a valid Python identifier.
DeliverabilityInfo = TypedDict("DeliverabilityInfo", {
    # (preference, exchange) pairs from the MX lookup, or [(0, domain)]
    # when an A/AAAA fallback was used.
    "mx": List[Tuple[int, str]],
    # None when real MX records were found, otherwise "A" or "AAAA".
    "mx_fallback_type": Optional[str],
    # Set instead of the keys above when the check could not be completed
    # ("no_nameservers" or "timeout").
    "unknown-deliverability": str,
}, total=False)
+ mtas = [(preference, exchange) for preference, exchange in mtas + if exchange != ""] + if len(mtas) == 0: # null MX only, if there were no MX records originally a NoAnswer exception would have occurred + raise EmailUndeliverableError(f"The domain name {domain_i18n} does not accept email.") + + deliverability_info["mx"] = mtas + deliverability_info["mx_fallback_type"] = None + + except dns.resolver.NoAnswer: + # If there was no MX record, fall back to an A or AAA record + # (RFC 5321 Section 5). Check A first since it's more common. + + # If the A/AAAA response has no Globally Reachable IP address, + # treat the response as if it were NoAnswer, i.e., the following + # address types are not allowed fallbacks: Private-Use, Loopback, + # Link-Local, and some other obscure ranges. See + # https://www.iana.org/assignments/iana-ipv4-special-registry/iana-ipv4-special-registry.xhtml + # https://www.iana.org/assignments/iana-ipv6-special-registry/iana-ipv6-special-registry.xhtml + # (Issue #134.) + def is_global_addr(address: Any) -> bool: + try: + ipaddr = ipaddress.ip_address(address) + except ValueError: + return False + return ipaddr.is_global + + try: + response = dns_resolver.resolve(domain, "A") + + if not any(is_global_addr(r.address) for r in response): + raise dns.resolver.NoAnswer # fall back to AAAA + + deliverability_info["mx"] = [(0, domain)] + deliverability_info["mx_fallback_type"] = "A" + + except dns.resolver.NoAnswer: + + # If there was no A record, fall back to an AAAA record. + # (It's unclear if SMTP servers actually do this.) 
+ try: + response = dns_resolver.resolve(domain, "AAAA") + + if not any(is_global_addr(r.address) for r in response): + raise dns.resolver.NoAnswer + + deliverability_info["mx"] = [(0, domain)] + deliverability_info["mx_fallback_type"] = "AAAA" + + except dns.resolver.NoAnswer as e: + # If there was no MX, A, or AAAA record, then mail to + # this domain is not deliverable, although the domain + # name has other records (otherwise NXDOMAIN would + # have been raised). + raise EmailUndeliverableError(f"The domain name {domain_i18n} does not accept email.") from e + + # Check for a SPF (RFC 7208) reject-all record ("v=spf1 -all") which indicates + # no emails are sent from this domain (similar to a Null MX record + # but for sending rather than receiving). In combination with the + # absence of an MX record, this is probably a good sign that the + # domain is not used for email. + try: + response = dns_resolver.resolve(domain, "TXT") + for rec in response: + value = b"".join(rec.strings) + if value.startswith(b"v=spf1 "): + if value == b"v=spf1 -all": + raise EmailUndeliverableError(f"The domain name {domain_i18n} does not send email.") + except dns.resolver.NoAnswer: + # No TXT records means there is no SPF policy, so we cannot take any action. + pass + + except dns.resolver.NXDOMAIN as e: + # The domain name does not exist --- there are no records of any sort + # for the domain name. + raise EmailUndeliverableError(f"The domain name {domain_i18n} does not exist.") from e + + except dns.resolver.NoNameservers: + # All nameservers failed to answer the query. This might be a problem + # with local nameservers, maybe? We'll allow the domain to go through. + return { + "unknown-deliverability": "no_nameservers", + } + + except dns.exception.Timeout: + # A timeout could occur for various reasons, so don't treat it as a failure. 
class ValidatedEmail:
    """The validate_email function returns objects of this type holding the normalized form of the email address
    and other information."""

    # The email address that was passed to validate_email. (If passed as bytes, this will be a string.)
    original: str

    # The normalized email address, which should always be used in preference to the original address.
    # The normalized address converts an IDNA ASCII domain name to Unicode, if possible, and performs
    # Unicode normalization on the local part and on the domain (if originally Unicode). It is the
    # concatenation of the local_part and domain attributes, separated by an @-sign.
    normalized: str

    # The local part of the email address after Unicode normalization.
    local_part: str

    # The domain part of the email address after Unicode normalization or conversion to
    # Unicode from IDNA ascii.
    domain: str

    # If the domain part is a domain literal, the IPv4Address or IPv6Address object.
    domain_address: object

    # If not None, a form of the email address that uses 7-bit ASCII characters only.
    ascii_email: Optional[str]

    # If not None, the local part of the email address using 7-bit ASCII characters only.
    ascii_local_part: Optional[str]

    # A form of the domain name that uses 7-bit ASCII characters only.
    ascii_domain: str

    # If True, the SMTPUTF8 feature of your mail relay will be required to transmit messages
    # to this address. This flag is True just when ascii_local_part is missing. Otherwise it
    # is False.
    smtputf8: bool

    # If a deliverability check is performed and if it succeeds, a list of (priority, domain)
    # tuples of MX records specified in the DNS for the domain.
    mx: List[Tuple[int, str]]

    # If no MX records are actually specified in DNS and instead are inferred, through an obsolete
    # mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`).
    mx_fallback_type: Optional[str]

    # The display name in the original input text, unquoted and unescaped, or None.
    display_name: Optional[str]

    def __repr__(self) -> str:
        return f"<ValidatedEmail {self.normalized}>"

    def __getattr__(self, key: str) -> str:
        """For backwards compatibility, support old field names.

        Only called when normal attribute lookup fails. Raises
        AttributeError for any name other than the legacy aliases.
        """
        if key == "original_email":
            return self.original
        if key == "email":
            # Normally served by the `email` property below.
            return self.normalized
        raise AttributeError(key)

    @property
    def email(self) -> str:
        """Deprecated alias for `normalized`; emits a DeprecationWarning."""
        warnings.warn("ValidatedEmail.email is deprecated and will be removed, use ValidatedEmail.normalized instead", DeprecationWarning)
        return self.normalized

    def __getitem__(self, key: str) -> Union[Optional[str], bool, List[Tuple[int, str]]]:
        """For backwards compatibility, some fields are also exposed through a dict-like interface.

        Note that some of the names changed when they became attributes.
        Emits a DeprecationWarning on every access and raises KeyError
        (carrying the offending key) for unknown names.
        """
        warnings.warn("dict-like access to the return value of validate_email is deprecated and may not be supported in the future.", DeprecationWarning, stacklevel=2)
        if key == "email":
            return self.normalized
        if key == "email_ascii":
            return self.ascii_email
        if key == "local":
            return self.local_part
        if key == "domain":
            return self.ascii_domain
        if key == "domain_i18n":
            return self.domain
        if key == "smtputf8":
            return self.smtputf8
        if key == "mx":
            return self.mx
        if key == "mx-fallback":
            return self.mx_fallback_type
        # Include the key so the error says what was asked for.
        raise KeyError(key)

    def __eq__(self, other: object) -> bool:
        """Field-by-field comparison with another ValidatedEmail. Tests use this."""
        if not isinstance(other, ValidatedEmail):
            return False
        return (
            self.normalized == other.normalized
            and self.local_part == other.local_part
            and self.domain == other.domain
            and getattr(self, 'ascii_email', None) == getattr(other, 'ascii_email', None)
            and getattr(self, 'ascii_local_part', None) == getattr(other, 'ascii_local_part', None)
            and getattr(self, 'ascii_domain', None) == getattr(other, 'ascii_domain', None)
            and self.smtputf8 == other.smtputf8
            # MX lists are compared order-insensitively; a missing or empty list counts as None.
            and repr(sorted(self.mx) if getattr(self, 'mx', None) else None)
            == repr(sorted(other.mx) if getattr(other, 'mx', None) else None)
            and getattr(self, 'mx_fallback_type', None) == getattr(other, 'mx_fallback_type', None)
            and getattr(self, 'display_name', None) == getattr(other, 'display_name', None)
        )

    def as_constructor(self) -> str:
        """Render the set fields as a constructor-style string. This helps producing the README."""
        return "ValidatedEmail(" \
            + ",".join(f"\n  {key}={repr(getattr(self, key))}"
                       for key in ('normalized', 'local_part', 'domain',
                                   'ascii_email', 'ascii_local_part', 'ascii_domain',
                                   'smtputf8', 'mx', 'mx_fallback_type',
                                   'display_name')
                       if hasattr(self, key)
                       ) \
            + ")"

    def as_dict(self) -> Dict[str, Any]:
        """Convenience method for accessing ValidatedEmail as a dict.

        Returns a new dict of the instance's attributes. A truthy
        `domain_address` is replaced in the returned dict by its repr();
        the instance itself is left untouched.
        """
        # Copy first: writing into self.__dict__ would overwrite the
        # instance's domain_address with a string.
        d = dict(self.__dict__)
        if d.get('domain_address'):
            d['domain_address'] = repr(d['domain_address'])
        return d
# Domain literal (RFC 5322 3.4.1)
# dtext is printable US-ASCII except "[", "\" and "]", i.e. the two ranges
# %d33-90 and %d94-126. The first range previously ended at U+00FA, which
# let the three excluded characters (and U+007F-U+00FA) through.
DOMAIN_LITERAL_CHARS = re.compile(r"[\u0021-\u005A\u005E-\u007E]")
+EMAIL_MAX_LENGTH = 254 +LOCAL_PART_MAX_LENGTH = 64 +DNS_LABEL_LENGTH_LIMIT = 63 # in "octets", RFC 1035 2.3.1 +DOMAIN_MAX_LENGTH = 253 # in "octets" as transmitted, RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2, and see https://stackoverflow.com/questions/32290167/what-is-the-maximum-length-of-a-dns-name + +# RFC 2142 +CASE_INSENSITIVE_MAILBOX_NAMES = [ + 'info', 'marketing', 'sales', 'support', # section 3 + 'abuse', 'noc', 'security', # section 4 + 'postmaster', 'hostmaster', 'usenet', 'news', 'webmaster', 'www', 'uucp', 'ftp', # section 5 +] diff --git a/.venv/lib/python3.12/site-packages/email_validator/syntax.py b/.venv/lib/python3.12/site-packages/email_validator/syntax.py new file mode 100644 index 00000000..c6554518 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/email_validator/syntax.py @@ -0,0 +1,761 @@ +from .exceptions_types import EmailSyntaxError, ValidatedEmail +from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ + DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ + DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS + +import re +import unicodedata +import idna # implements IDNA 2008; Python's codec is only IDNA 2003 +import ipaddress +from typing import Optional, Tuple, TypedDict, Union + + +def split_email(email: str) -> Tuple[Optional[str], str, str, bool]: + # Return the display name, unescaped local part, and domain part + # of the address, and whether the local part was quoted. If no + # display name was present and angle brackets do not surround + # the address, display name will be None; otherwise, it will be + # set to the display name or the empty string if there were + # angle brackets but no display name. 
+ + # Typical email addresses have a single @-sign and no quote + # characters, but the awkward "quoted string" local part form + # (RFC 5321 4.1.2) allows @-signs and escaped quotes to appear + # in the local part if the local part is quoted. + + # A `display name <addr>` format is also present in MIME messages + # (RFC 5322 3.4) and this format is also often recognized in + # mail UIs. It's not allowed in SMTP commands or in typical web + # login forms, but parsing it has been requested, so it's done + # here as a convenience. It's implemented in the spirit but not + # the letter of RFC 5322 3.4 because MIME messages allow newlines + # and comments as a part of the CFWS rule, but this is typically + # not allowed in mail UIs (although comment syntax was requested + # once too). + # + # Display names are either basic characters (the same basic characters + # permitted in email addresses, but periods are not allowed and spaces + # are allowed; see RFC 5322 Appendix A.1.2), or or a quoted string with + # the same rules as a quoted local part. (Multiple quoted strings might + # be allowed? Unclear.) Optional space (RFC 5322 3.4 CFWS) and then the + # email address follows in angle brackets. + # + # An initial quote is ambiguous between starting a display name or + # a quoted local part --- fun. + # + # We assume the input string is already stripped of leading and + # trailing CFWS. + + def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tuple[str, str]: + # Split the string at the first character in specials (an @-sign + # or left angle bracket) that does not occur within quotes and + # is not followed by a Unicode combining character. + # If no special character is found, raise an error. 
+ inside_quote = False + escaped = False + left_part = "" + for i, c in enumerate(text): + # < plus U+0338 (Combining Long Solidus Overlay) normalizes to + # ≮ U+226E (Not Less-Than), and it would be confusing to treat + # the < as the start of "<email>" syntax in that case. Liekwise, + # if anything combines with an @ or ", we should probably not + # treat it as a special character. + if unicodedata.normalize("NFC", text[i:])[0] != c: + left_part += c + + elif inside_quote: + left_part += c + if c == '\\' and not escaped: + escaped = True + elif c == '"' and not escaped: + # The only way to exit the quote is an unescaped quote. + inside_quote = False + escaped = False + else: + escaped = False + elif c == '"': + left_part += c + inside_quote = True + elif c in specials: + # When unquoted, stop before a special character. + break + else: + left_part += c + + if len(left_part) == len(text): + raise EmailSyntaxError("An email address must have an @-sign.") + + # The right part is whatever is left. + right_part = text[len(left_part):] + + return left_part, right_part + + def unquote_quoted_string(text: str) -> Tuple[str, bool]: + # Remove surrounding quotes and unescape escaped backslashes + # and quotes. Escapes are parsed liberally. I think only + # backslashes and quotes can be escaped but we'll allow anything + # to be. + quoted = False + escaped = False + value = "" + for i, c in enumerate(text): + if quoted: + if escaped: + value += c + escaped = False + elif c == '\\': + escaped = True + elif c == '"': + if i != len(text) - 1: + raise EmailSyntaxError("Extra character(s) found after close quote: " + + ", ".join(safe_character_display(c) for c in text[i + 1:])) + break + else: + value += c + elif i == 0 and c == '"': + quoted = True + else: + value += c + + return value, quoted + + # Split the string at the first unquoted @-sign or left angle bracket. 
+ left_part, right_part = split_string_at_unquoted_special(email, ("@", "<")) + + # If the right part starts with an angle bracket, + # then the left part is a display name and the rest + # of the right part up to the final right angle bracket + # is the email address, . + if right_part.startswith("<"): + # Remove space between the display name and angle bracket. + left_part = left_part.rstrip() + + # Unquote and unescape the display name. + display_name, display_name_quoted = unquote_quoted_string(left_part) + + # Check that only basic characters are present in a + # non-quoted display name. + if not display_name_quoted: + bad_chars = { + safe_character_display(c) + for c in display_name + if (not ATEXT_RE.match(c) and c != ' ') or c == '.' + } + if bad_chars: + raise EmailSyntaxError("The display name contains invalid characters when not quoted: " + ", ".join(sorted(bad_chars)) + ".") + + # Check for other unsafe characters. + check_unsafe_chars(display_name, allow_space=True) + + # Check that the right part ends with an angle bracket + # but allow spaces after it, I guess. + if ">" not in right_part: + raise EmailSyntaxError("An open angle bracket at the start of the email address has to be followed by a close angle bracket at the end.") + right_part = right_part.rstrip(" ") + if right_part[-1] != ">": + raise EmailSyntaxError("There can't be anything after the email address.") + + # Remove the initial and trailing angle brackets. + addr_spec = right_part[1:].rstrip(">") + + # Split the email address at the first unquoted @-sign. + local_part, domain_part = split_string_at_unquoted_special(addr_spec, ("@",)) + + # Otherwise there is no display name. The left part is the local + # part and the right part is the domain. + else: + display_name = None + local_part, domain_part = left_part, right_part + + if domain_part.startswith("@"): + domain_part = domain_part[1:] + + # Unquote the local part if it is quoted. 
def safe_character_display(c: str) -> str:
    """Return a form of the single character `c` that is safe to show in an error message.

    Letters, numbers, punctuation and symbols are returned quoted via
    repr(). A backslash is returned in double quotes. Any other character
    is replaced by its Unicode name or, if it has none, by a "U+XXXX"
    hex code (four digits up to U+FFFF, eight digits above).
    """
    # Return safely displayable characters in quotes.
    if c == '\\':
        return f"\"{c}\""  # can't use repr because it escapes it
    if unicodedata.category(c)[0] in ("L", "N", "P", "S"):
        return repr(c)

    # Construct a hex string in case the unicode name doesn't exist.
    # U+FFFF itself still fits in four digits, hence <= rather than <.
    codepoint = ord(c)
    width = 4 if codepoint <= 0xFFFF else 8
    h = f"U+{codepoint:0{width}X}"

    # Return the character name or, if it has no name, the hex string.
    return unicodedata.name(c, h)
+ if len(local) > LOCAL_PART_MAX_LENGTH: + reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH) + raise EmailSyntaxError(f"The email address is too long before the @-sign {reason}.") + + # Check the local part against the non-internationalized regular expression. + # Most email addresses match this regex so it's probably fastest to check this first. + # (RFC 5322 3.2.3) + # All local parts matching the dot-atom rule are also valid as a quoted string + # so if it was originally quoted (quoted_local_part is True) and this regex matches, + # it's ok. + # (RFC 5321 4.1.2 / RFC 5322 3.2.4). + if DOT_ATOM_TEXT.match(local): + # It's valid. And since it's just the permitted ASCII characters, + # it's normalized and safe. If the local part was originally quoted, + # the quoting was unnecessary and it'll be returned as normalized to + # non-quoted form. + + # Return the local part and flag that SMTPUTF8 is not needed. + return { + "local_part": local, + "ascii_local_part": local, + "smtputf8": False, + } + + # The local part failed the basic dot-atom check. Try the extended character set + # for internationalized addresses. It's the same pattern but with additional + # characters permitted. + # RFC 6531 section 3.3. + valid: Optional[str] = None + requires_smtputf8 = False + if DOT_ATOM_TEXT_INTL.match(local): + # But international characters in the local part may not be permitted. + if not allow_smtputf8: + # Check for invalid characters against the non-internationalized + # permitted character set. + # (RFC 5322 3.2.3) + bad_chars = { + safe_character_display(c) + for c in local + if not ATEXT_RE.match(c) + } + if bad_chars: + raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".") + + # Although the check above should always find something, fall back to this just in case. + raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.") + + # It's valid. 
+ valid = "dot-atom" + requires_smtputf8 = True + + # There are no syntactic restrictions on quoted local parts, so if + # it was originally quoted, it is probably valid. More characters + # are allowed, like @-signs, spaces, and quotes, and there are no + # restrictions on the placement of dots, as in dot-atom local parts. + elif quoted_local_part: + # Check for invalid characters in a quoted string local part. + # (RFC 5321 4.1.2. RFC 5322 lists additional permitted *obsolete* + # characters which are *not* allowed here. RFC 6531 section 3.3 + # extends the range to UTF8 strings.) + bad_chars = { + safe_character_display(c) + for c in local + if not QTEXT_INTL.match(c) + } + if bad_chars: + raise EmailSyntaxError("The email address contains invalid characters in quotes before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") + + # See if any characters are outside of the ASCII range. + bad_chars = { + safe_character_display(c) + for c in local + if not (32 <= ord(c) <= 126) + } + if bad_chars: + requires_smtputf8 = True + + # International characters in the local part may not be permitted. + if not allow_smtputf8: + raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".") + + # It's valid. + valid = "quoted" + + # If the local part matches the internationalized dot-atom form or was quoted, + # perform additional checks for Unicode strings. + if valid: + # Check that the local part is a valid, safe, and sensible Unicode string. + # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked + # by DOT_ATOM_TEXT_INTL and QTEXT_INTL. Other characters may be permitted by the + # email specs, but they may not be valid, safe, or sensible Unicode strings. + # See the function for rationale. + check_unsafe_chars(local, allow_space=(valid == "quoted")) + + # Try encoding to UTF-8. 
def check_unsafe_chars(s: str, allow_space: bool = False) -> None:
    """Raise EmailSyntaxError if *s* contains characters that are unsafe or not sensible.

    Each character is classified by its Unicode general category:

    * L, N, P, S (letters, numbers, punctuation, symbols) are accepted.
    * M (combining marks) are rejected only in the first position, where
      they would combine with whatever text precedes the string.
    * Zs (space separators, which includes the ASCII space) are rejected
      unless *allow_space* is true, as it is for quoted local parts.
    * Zl / Zp (line and paragraph separators) are always rejected.
    * C (control, format, surrogate, private use, unassigned) are always
      rejected.
    * Any category not listed above is rejected as well, so that future
      additions to Unicode are not silently accepted.

    All offending characters are collected and reported together, sorted,
    in a single error message.
    """

    def _is_unsafe(position: int, char: str) -> bool:
        # Decide whether one character at the given index must be rejected.
        category = unicodedata.category(char)
        major_class = category[0]
        if major_class in ("L", "N", "P", "S"):
            return False
        if major_class == "M":
            return position == 0
        if category == "Zs":
            return not allow_space
        # Remaining "Z" categories, all "C" categories, and anything unknown.
        return True

    offenders = {
        char
        for position, char in enumerate(s)
        if _is_unsafe(position, char)
    }
    if offenders:
        listing = ", ".join(safe_character_display(char) for char in sorted(offenders))
        raise EmailSyntaxError("The email address contains unsafe characters: " + listing + ".")


def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bool) -> None:
    """Raise EmailSyntaxError if periods (and, for hostnames, hyphens) are misplaced in *label*.

    *start_descr* and *end_descr* are message templates containing one
    ``{}`` placeholder, filled with ``"period"`` or ``"hyphen"``. The
    trailing position is tested before the leading one, and the period
    rules (RFC 5322 3.2.3) are tested before the hyphen rules (RFC 952),
    which apply only when *is_hostname* is true.
    """

    def _reject_at_edges(mark: str, mark_name: str) -> None:
        # A label may neither end nor begin with the given mark.
        if label.endswith(mark):
            raise EmailSyntaxError(end_descr.format(mark_name))
        if label.startswith(mark):
            raise EmailSyntaxError(start_descr.format(mark_name))

    _reject_at_edges(".", "period")
    if ".." in label:
        raise EmailSyntaxError("An email address cannot have two periods in a row.")

    if not is_hostname:
        return

    _reject_at_edges("-", "hyphen")
    if ".-" in label or "-." in label:
        raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.")


class DomainNameValidationResult(TypedDict):
    # Shape of the dict returned by validate_email_domain_name.
    # IDNA ASCII form of the domain.
    ascii_domain: str
    # Internationalized (Unicode) form of the domain.
    domain: str
+ check_unsafe_chars(domain) + + # Perform UTS-46 normalization, which includes casefolding, NFC normalization, + # and converting all label separators (the period/full stop, fullwidth full stop, + # ideographic full stop, and halfwidth ideographic full stop) to regular dots. + # It will also raise an exception if there is an invalid character in the input, + # such as "⒈" which is invalid because it would expand to include a dot and + # U+1FEF which normalizes to a backtick, which is not an allowed hostname character. + # Since several characters *are* normalized to a dot, this has to come before + # checks related to dots, like check_dot_atom which comes next. + original_domain = domain + try: + domain = idna.uts46_remap(domain, std3_rules=False, transitional=False) + except idna.IDNAError as e: + raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e + + # Check for invalid characters after Unicode normalization which are not caught + # by uts46_remap (see tests for examples). + bad_chars = { + safe_character_display(c) + for c in domain + if not ATEXT_HOSTNAME_INTL.match(c) + } + if bad_chars: + raise EmailSyntaxError("The part after the @-sign contains invalid characters after Unicode normalization: " + ", ".join(sorted(bad_chars)) + ".") + + # The domain part is made up dot-separated "labels." Each label must + # have at least one character and cannot start or end with dashes, which + # means there are some surprising restrictions on periods and dashes. + # Check that before we do IDNA encoding because the IDNA library gives + # unfriendly errors for these cases, but after UTS-46 normalization because + # it can insert periods and hyphens (from fullwidth characters). 
+ # (RFC 952, RFC 1123 2.1, RFC 5322 3.2.3) + check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True) + + # Check for RFC 5890's invalid R-LDH labels, which are labels that start + # with two characters other than "xn" and two dashes. + for label in domain.split("."): + if re.match(r"(?!xn)..--", label, re.I): + raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.") + + if DOT_ATOM_TEXT_HOSTNAME.match(domain): + # This is a valid non-internationalized domain. + ascii_domain = domain + else: + # If international characters are present in the domain name, convert + # the domain to IDNA ASCII. If internationalized characters are present, + # the MTA must either support SMTPUTF8 or the mail client must convert the + # domain name to IDNA before submission. + # + # For ASCII-only domains, the transformation does nothing and is safe to + # apply. However, to ensure we don't rely on the idna library for basic + # syntax checks, we don't use it if it's not needed. + # + # idna.encode also checks the domain name length after encoding but it + # doesn't give a nice error, so we call the underlying idna.alabel method + # directly. idna.alabel checks label length and doesn't give great messages, + # but we can't easily go to lower level methods. + try: + ascii_domain = ".".join( + idna.alabel(label).decode("ascii") + for label in domain.split(".") + ) + except idna.IDNAError as e: + # Some errors would have already been raised by idna.uts46_remap. + raise EmailSyntaxError(f"The part after the @-sign is invalid ({e}).") from e + + # Check the syntax of the string returned by idna.encode. + # It should never fail. 
+ if not DOT_ATOM_TEXT_HOSTNAME.match(ascii_domain): + raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.") + + # Check the length of the domain name in bytes. + # (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2) + # We're checking the number of bytes ("octets") here, which can be much + # higher than the number of characters in internationalized domains, + # on the assumption that the domain may be transmitted without SMTPUTF8 + # as IDNA ASCII. (This is also checked by idna.encode, so this exception + # is never reached for internationalized domains.) + if len(ascii_domain) > DOMAIN_MAX_LENGTH: + if ascii_domain == original_domain: + reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH) + raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.") + else: + diff = len(ascii_domain) - DOMAIN_MAX_LENGTH + s = "" if diff == 1 else "s" + raise EmailSyntaxError(f"The email address is too long after the @-sign ({diff} byte{s} too many after IDNA encoding).") + + # Also check the label length limit. + # (RFC 1035 2.3.1) + for label in ascii_domain.split("."): + if len(label) > DNS_LABEL_LENGTH_LIMIT: + reason = get_length_reason(label, limit=DNS_LABEL_LENGTH_LIMIT) + raise EmailSyntaxError(f"After the @-sign, periods cannot be separated by so many characters {reason}.") + + if globally_deliverable: + # All publicly deliverable addresses have domain names with at least + # one period, at least for gTLDs created since 2013 (per the ICANN Board + # New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en). + # We'll consider the lack of a period a syntax error + # since that will match people's sense of what an email address looks + # like. We'll skip this in test environments to allow '@test' email + # addresses. + if "." 
not in ascii_domain and not (ascii_domain == "test" and test_environment): + raise EmailSyntaxError("The part after the @-sign is not valid. It should have a period.") + + # We also know that all TLDs currently end with a letter. + if not DOMAIN_NAME_REGEX.search(ascii_domain): + raise EmailSyntaxError("The part after the @-sign is not valid. It is not within a valid top-level domain.") + + # Check special-use and reserved domain names. + # Some might fail DNS-based deliverability checks, but that + # can be turned off, so we should fail them all sooner. + # See the references in __init__.py. + from . import SPECIAL_USE_DOMAIN_NAMES + for d in SPECIAL_USE_DOMAIN_NAMES: + # See the note near the definition of SPECIAL_USE_DOMAIN_NAMES. + if d == "test" and test_environment: + continue + + if ascii_domain == d or ascii_domain.endswith("." + d): + raise EmailSyntaxError("The part after the @-sign is a special-use or reserved name that cannot be used with email.") + + # We may have been given an IDNA ASCII domain to begin with. Check + # that the domain actually conforms to IDNA. It could look like IDNA + # but not be actual IDNA. For ASCII-only domains, the conversion out + # of IDNA just gives the same thing back. + # + # This gives us the canonical internationalized form of the domain, + # which we return to the caller as a part of the normalized email + # address. + try: + domain_i18n = idna.decode(ascii_domain.encode('ascii')) + except idna.IDNAError as e: + raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") from e + + # Check that this normalized domain name has not somehow become + # an invalid domain name. All of the checks before this point + # using the idna package probably guarantee that we now have + # a valid international domain name in most respects. But it + # doesn't hurt to re-apply some tests to be sure. See the similar + # tests above. + + # Check for invalid and unsafe characters. We have no test + # case for this. 
def validate_email_length(addrinfo: ValidatedEmail) -> None:
    """Raise EmailSyntaxError if any form of the address is longer than EMAIL_MAX_LENGTH.

    Three strings are measured, in this order, and the first one that is
    too long triggers the error:

    1. ``addrinfo.original`` - the input as given. Callers may keep using
       it, so it must be valid by itself.
    2. ``addrinfo.normalized`` - the form callers are advised to use.
    3. The normalized local part (its ASCII form when there is one) joined
       to ``addrinfo.ascii_domain`` - the form an email stack without UTF-8
       support would use.

    Lengths are always measured in UTF-8 bytes. The wording of the error
    depends on what can be said about the excess: an exact character count
    when the original is pure ASCII, a range of characters when the
    original contains multi-byte characters, and a byte count for the
    derived forms.
    """
    candidates = [
        (addrinfo.original, None),
        (addrinfo.normalized, "after normalization"),
        ((addrinfo.ascii_local_part or addrinfo.local_part or "") + "@" + addrinfo.ascii_domain, "when the part after the @-sign is converted to IDNA ASCII"),
    ]

    for address, context in candidates:
        char_count = len(address)
        byte_count = len(address.encode("utf8"))
        excess = byte_count - EMAIL_MAX_LENGTH
        if excess <= 0:
            continue

        if context is not None:
            # A derived form: the number of input characters that would
            # have to change cannot be known, so report bytes.
            plural = "s" if excess > 1 else ""
            detail = context + f" ({excess} byte{plural} too many)"
        elif char_count == byte_count:
            # The original input, one byte per character: a plain
            # character count is exact.
            detail = get_length_reason(address, limit=EMAIL_MAX_LENGTH)
        else:
            # The original input with multi-byte characters: the fewest
            # characters that could account for the excess is the excess
            # divided by the widest character present.
            widest = max(len(ch.encode("utf8")) for ch in address)
            fewest = max(1, excess // widest)
            plural = "s" if excess > 1 else ""
            if fewest == excess:
                detail = f"({excess} character{plural} too many)"
            else:
                detail = f"({fewest}-{excess} character{plural} too many)"

        raise EmailSyntaxError(f"The email address is too long {detail}.")


class DomainLiteralValidationResult(TypedDict):
    # Shape of the dict returned by validate_email_domain_literal.
    # Parsed IP address object.
    domain_address: Union[ipaddress.IPv4Address, ipaddress.IPv6Address]
    # Normalized bracketed literal.
    domain: str
def validate_email(
    email: Union[str, bytes],
    /,  # prior arguments are positional-only
    *,  # subsequent arguments are keyword-only
    allow_smtputf8: Optional[bool] = None,
    allow_empty_local: bool = False,
    allow_quoted_local: Optional[bool] = None,
    allow_domain_literal: Optional[bool] = None,
    allow_display_name: Optional[bool] = None,
    check_deliverability: Optional[bool] = None,
    test_environment: Optional[bool] = None,
    globally_deliverable: Optional[bool] = None,
    timeout: Optional[int] = None,
    dns_resolver: Optional[_Resolver] = None
) -> ValidatedEmail:
    """
    Given an email address, and some options, returns a ValidatedEmail instance
    with information about the address if it is valid or, if the address is not
    valid, raises an EmailNotValidError. This is the main function of the module.

    Arguments left as None take their value from the package-level globals
    of the same (upper-cased) name, read at call time.

    Args:
        email: The address as str, or as bytes that must decode as ASCII.
        allow_smtputf8: Forwarded to validate_email_local_part.
        allow_empty_local: Forwarded to validate_email_local_part.
        allow_quoted_local: When false, a quoted local part is rejected
            (after the local part's own syntax checks have run).
        allow_domain_literal: When false, a bracketed domain is rejected
            (after the literal itself has been parsed).
        allow_display_name: When false, an address that carried a display
            name is rejected (as the last syntax check).
        check_deliverability: When true, and test_environment is false and
            the domain is not a literal, validate_email_deliverability is
            called and its "mx" / "mx_fallback_type" results are stored.
        test_environment: Forwarded to validate_email_domain_name; also
            disables the deliverability check.
        globally_deliverable: Forwarded to validate_email_domain_name.
        timeout: Forwarded to validate_email_deliverability. Defaults to
            DEFAULT_TIMEOUT only when dns_resolver is also None.
        dns_resolver: Forwarded to validate_email_deliverability.

    Returns:
        The populated ValidatedEmail instance.

    Raises:
        EmailSyntaxError: For every syntax failure detected directly in
            this function or by the syntax helpers it calls.
            NOTE(review): presumably a subclass of EmailNotValidError;
            confirm in exceptions_types.
        Exception: If no ASCII domain was produced for an address that
            does not need SMTPUTF8 (an internal consistency check).
    """

    # Fill in default values of arguments.
    # The import is done inside the function so that the globals are read
    # on each call rather than once at module import.
    from . import ALLOW_SMTPUTF8, ALLOW_QUOTED_LOCAL, ALLOW_DOMAIN_LITERAL, ALLOW_DISPLAY_NAME, \
        GLOBALLY_DELIVERABLE, CHECK_DELIVERABILITY, TEST_ENVIRONMENT, DEFAULT_TIMEOUT
    if allow_smtputf8 is None:
        allow_smtputf8 = ALLOW_SMTPUTF8
    if allow_quoted_local is None:
        allow_quoted_local = ALLOW_QUOTED_LOCAL
    if allow_domain_literal is None:
        allow_domain_literal = ALLOW_DOMAIN_LITERAL
    if allow_display_name is None:
        allow_display_name = ALLOW_DISPLAY_NAME
    if check_deliverability is None:
        check_deliverability = CHECK_DELIVERABILITY
    if test_environment is None:
        test_environment = TEST_ENVIRONMENT
    if globally_deliverable is None:
        globally_deliverable = GLOBALLY_DELIVERABLE
    # A caller-supplied resolver keeps timeout as None.
    if timeout is None and dns_resolver is None:
        timeout = DEFAULT_TIMEOUT

    # Allow email to be a str or bytes instance. If bytes,
    # it must be ASCII because that's how the bytes work
    # on the wire with SMTP.
    if not isinstance(email, str):
        try:
            email = email.decode("ascii")
        except ValueError as e:
            raise EmailSyntaxError("The email address is not valid ASCII.") from e

    # Split the address into the display name (or None), the local part
    # (before the @-sign), and the domain part (after the @-sign).
    # Normally, there is only one @-sign. But the awkward "quoted string"
    # local part form (RFC 5321 4.1.2) allows @-signs in the local
    # part if the local part is quoted.
    display_name, local_part, domain_part, is_quoted_local_part \
        = split_email(email)

    # Collect return values in this instance.
    ret = ValidatedEmail()
    ret.original = ((local_part if not is_quoted_local_part
                    else ('"' + local_part + '"'))
                    + "@" + domain_part)  # drop the display name, if any, for email length tests at the end
    ret.display_name = display_name

    # Validate the email address's local part syntax and get a normalized form.
    # If the original address was quoted and the decoded local part is a valid
    # unquoted local part, then we'll get back a normalized (unescaped) local
    # part.
    local_part_info = validate_email_local_part(local_part,
                                                allow_smtputf8=allow_smtputf8,
                                                allow_empty_local=allow_empty_local,
                                                quoted_local_part=is_quoted_local_part)
    ret.local_part = local_part_info["local_part"]
    ret.ascii_local_part = local_part_info["ascii_local_part"]
    ret.smtputf8 = local_part_info["smtputf8"]

    # RFC 6532 section 3.1 says that Unicode NFC normalization should be applied,
    # so we'll return the NFC-normalized local part. Since the caller may use that
    # string in place of the original string, ensure it is also valid.
    normalized_local_part = unicodedata.normalize("NFC", ret.local_part)
    if normalized_local_part != ret.local_part:
        try:
            # Only the pass/fail outcome matters here; the returned dict is
            # discarded and ret.ascii_local_part / ret.smtputf8 are kept
            # from the first validation.
            validate_email_local_part(normalized_local_part,
                                      allow_smtputf8=allow_smtputf8,
                                      allow_empty_local=allow_empty_local,
                                      quoted_local_part=is_quoted_local_part)
        except EmailSyntaxError as e:
            raise EmailSyntaxError("After Unicode normalization: " + str(e)) from e
        ret.local_part = normalized_local_part

    # If a quoted local part isn't allowed but is present, now raise an exception.
    # This is done after any exceptions raised by validate_email_local_part so
    # that mandatory checks have highest precedence.
    if is_quoted_local_part and not allow_quoted_local:
        raise EmailSyntaxError("Quoting the part before the @-sign is not allowed here.")

    # Some local parts are required to be case-insensitive, so we should normalize
    # to lowercase.
    # RFC 2142
    if ret.ascii_local_part is not None \
       and ret.ascii_local_part.lower() in CASE_INSENSITIVE_MAILBOX_NAMES \
       and ret.local_part is not None:
        ret.ascii_local_part = ret.ascii_local_part.lower()
        ret.local_part = ret.local_part.lower()

    # Validate the email address's domain part syntax and get a normalized form.
    is_domain_literal = False
    if len(domain_part) == 0:
        raise EmailSyntaxError("There must be something after the @-sign.")

    elif domain_part.startswith("[") and domain_part.endswith("]"):
        # Parse the address in the domain literal and get back a normalized domain.
        # The literal is parsed before the permission check so that a
        # malformed literal is reported in preference to "not allowed".
        domain_literal_info = validate_email_domain_literal(domain_part[1:-1])
        if not allow_domain_literal:
            raise EmailSyntaxError("A bracketed IP address after the @-sign is not allowed here.")
        ret.domain = domain_literal_info["domain"]
        ret.ascii_domain = domain_literal_info["domain"]  # Domain literals are always ASCII.
        ret.domain_address = domain_literal_info["domain_address"]
        is_domain_literal = True  # Prevent deliverability checks.

    else:
        # Check the syntax of the domain and get back a normalized
        # internationalized and ASCII form.
        domain_name_info = validate_email_domain_name(domain_part, test_environment=test_environment, globally_deliverable=globally_deliverable)
        ret.domain = domain_name_info["domain"]
        ret.ascii_domain = domain_name_info["ascii_domain"]

    # Construct the complete normalized form.
    ret.normalized = ret.local_part + "@" + ret.domain

    # If the email address has an ASCII form, add it.
    if not ret.smtputf8:
        if not ret.ascii_domain:
            raise Exception("Missing ASCII domain.")
        ret.ascii_email = (ret.ascii_local_part or "") + "@" + ret.ascii_domain
    else:
        ret.ascii_email = None

    # Check the length of the address.
    validate_email_length(ret)

    # Check that a display name is permitted. It's the last syntax check
    # because we always check against optional parsing features last.
    if display_name is not None and not allow_display_name:
        raise EmailSyntaxError("A display name and angle brackets around the email address are not permitted here.")

    if check_deliverability and not test_environment:
        # Validate the email address's deliverability using DNS
        # and update the returned ValidatedEmail object with metadata.

        if is_domain_literal:
            # There is nothing to check --- skip deliverability checks.
            return ret

        # Lazy load `deliverability` as it is slow to import (due to dns.resolver)
        from .deliverability import validate_email_deliverability
        deliverability_info = validate_email_deliverability(
            ret.ascii_domain, ret.domain, timeout, dns_resolver
        )
        # The MX fields are only set when the result actually contains
        # an "mx" entry.
        mx = deliverability_info.get("mx")
        if mx is not None:
            ret.mx = mx
            ret.mx_fallback_type = deliverability_info.get("mx_fallback_type")

    return ret