diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/prometheus_client/openmetrics/parser.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/prometheus_client/openmetrics/parser.py | 614 |
1 files changed, 614 insertions, 0 deletions
#!/usr/bin/env python


import io as StringIO
import math
import re

from ..metrics_core import Metric, METRIC_LABEL_NAME_RE
from ..samples import Exemplar, Sample, Timestamp
from ..utils import floatToGoString


def text_string_to_metric_families(text):
    """Parse Openmetrics text format from a unicode string.

    See text_fd_to_metric_families.
    """
    yield from text_fd_to_metric_families(StringIO.StringIO(text))


# Values whose textual spelling must match what floatToGoString() produces.
_CANONICAL_NUMBERS = {float("inf")}


def _isUncanonicalNumber(s):
    """Return True if *s* is a canonical number written in a non-canonical way.

    Only values in ``_CANONICAL_NUMBERS`` are checked; any other number is
    never reported as uncanonical. Raises ValueError (from ``float``) when
    *s* is not a number at all.
    """
    f = float(s)
    if f not in _CANONICAL_NUMBERS:
        return False  # Only the canonical numbers are required to be canonical.
    return s != floatToGoString(f)


# Escape sequences recognised inside label values.
ESCAPE_SEQUENCES = {
    '\\\\': '\\',
    '\\n': '\n',
    '\\"': '"',
}


def _replace_escape_sequence(match):
    """Regex substitution callback: map a matched escape to its character."""
    return ESCAPE_SEQUENCES[match.group(0)]


ESCAPING_RE = re.compile(r'\\[\\n"]')


def _replace_escaping(s):
    """Return *s* with ``\\\\``, ``\\n`` and ``\\"`` escapes expanded."""
    return ESCAPING_RE.sub(_replace_escape_sequence, s)


def _unescape_help(text):
    """Expand backslash escapes in HELP text.

    ``\\\\``, ``\\"`` and ``\\n`` are expanded; any other escaped character
    is kept together with its backslash, and a trailing lone backslash is
    preserved as-is.
    """
    result = []
    slash = False

    for char in text:
        if slash:
            if char == '\\':
                result.append('\\')
            elif char == '"':
                result.append('"')
            elif char == 'n':
                result.append('\n')
            else:
                result.append('\\' + char)
            slash = False
        else:
            if char == '\\':
                slash = True
            else:
                result.append(char)

    if slash:
        result.append('\\')

    return ''.join(result)


def _parse_value(value):
    """Parse a sample value given as a string or an iterable of characters.

    Returns an int when the text is an integer literal, otherwise a float.
    Raises ValueError for surrounding whitespace, underscores (which Python's
    numeric parsers would otherwise accept) or non-numeric text.
    """
    value = ''.join(value)
    if value != value.strip() or '_' in value:
        raise ValueError(f"Invalid value: {value!r}")
    try:
        return int(value)
    except ValueError:
        return float(value)


def _parse_timestamp(timestamp):
    """Parse a timestamp given as a string or an iterable of characters.

    Returns None for empty input, a ``Timestamp(seconds, nanoseconds)`` for
    plain integers and ``aaaa.bbbb`` decimals, and a float for any other
    finite number accepted by ``float``. Raises ValueError otherwise.
    """
    timestamp = ''.join(timestamp)
    if not timestamp:
        return None
    if timestamp != timestamp.strip() or '_' in timestamp:
        raise ValueError(f"Invalid timestamp: {timestamp!r}")
    try:
        # Simple int.
        return Timestamp(int(timestamp), 0)
    except ValueError:
        try:
            # aaaa.bbbb. Nanosecond resolution supported.
            parts = timestamp.split('.', 1)
            # Parse the seconds first: when there is no '.', this raises
            # ValueError before parts[1] is ever touched.
            seconds = int(parts[0])
            fraction = parts[1]
            # The fraction must consist of digits only. Without this check,
            # int() on the truncated/padded fraction would silently accept
            # a sign or blank ("1.-5", "1. 5") and anything following the
            # ninth digit ("1.123456789abc", "1.123456789e2").
            # The ValueError is caught below so float() gets the final say.
            if fraction and not fraction.isdecimal():
                raise ValueError(f"Invalid timestamp: {timestamp!r}")
            return Timestamp(seconds, int(fraction[:9].ljust(9, "0")))
        except ValueError:
            # Float.
            ts = float(timestamp)
            if math.isnan(ts) or math.isinf(ts):
                raise ValueError(f"Invalid timestamp: {timestamp!r}")
            return ts


def _is_character_escaped(s, charpos):
    """Return True if position *charpos* of *s* is preceded by an odd number
    of consecutive backslashes (i.e. the character there is escaped)."""
    num_bslashes = 0
    while (charpos > num_bslashes
           and s[charpos - 1 - num_bslashes] == '\\'):
        num_bslashes += 1
    return num_bslashes % 2 == 1


def _parse_labels_with_state_machine(text):
    """Parse a label set character by character.

    *text* starts right after the opening ``{``. Returns ``(labels,
    labels_len)`` where ``labels_len`` is the number of characters consumed
    up to and including the closing ``}`` (the following space is not
    counted). Raises ValueError on malformed input.
    """
    # The { has already been parsed.
    state = 'startoflabelname'
    labelname = []
    labelvalue = []
    labels = {}
    labels_len = 0

    for char in text:
        if state == 'startoflabelname':
            if char == '}':
                state = 'endoflabels'
            else:
                state = 'labelname'
                labelname.append(char)
        elif state == 'labelname':
            if char == '=':
                state = 'labelvaluequote'
            else:
                labelname.append(char)
        elif state == 'labelvaluequote':
            if char == '"':
                state = 'labelvalue'
            else:
                raise ValueError("Invalid line: " + text)
        elif state == 'labelvalue':
            if char == '\\':
                state = 'labelvalueslash'
            elif char == '"':
                ln = ''.join(labelname)
                if not METRIC_LABEL_NAME_RE.match(ln):
                    raise ValueError("Invalid line, bad label name: " + text)
                if ln in labels:
                    raise ValueError("Invalid line, duplicate label name: " + text)
                labels[ln] = ''.join(labelvalue)
                labelname = []
                labelvalue = []
                state = 'endoflabelvalue'
            else:
                labelvalue.append(char)
        elif state == 'endoflabelvalue':
            if char == ',':
                state = 'labelname'
            elif char == '}':
                state = 'endoflabels'
            else:
                raise ValueError("Invalid line: " + text)
        elif state == 'labelvalueslash':
            state = 'labelvalue'
            if char == '\\':
                labelvalue.append('\\')
            elif char == 'n':
                labelvalue.append('\n')
            elif char == '"':
                labelvalue.append('"')
            else:
                labelvalue.append('\\' + char)
        elif state == 'endoflabels':
            if char == ' ':
                # The space after '}' ends the label set; it is not counted.
                break
            else:
                raise ValueError("Invalid line: " + text)
        labels_len += 1
    return labels, labels_len


def _parse_labels(text):
    """Parse the text between ``{`` and ``}`` into a dict of labels.

    Raises ValueError("Invalid labels: ...") for any malformed input,
    including bad or duplicate label names.
    """
    labels = {}

    # Raise error if we don't have valid labels
    if text and "=" not in text:
        raise ValueError

    # Copy original labels
    sub_labels = text
    try:
        # Process one label at a time
        while sub_labels:
            # The label name is before the equal
            value_start = sub_labels.index("=")
            label_name = sub_labels[:value_start]
            sub_labels = sub_labels[value_start + 1:]

            # Check for missing quotes
            if not sub_labels or sub_labels[0] != '"':
                raise ValueError

            # The first quote is guaranteed to be after the equal
            value_substr = sub_labels[1:]

            # Check for extra commas
            if not label_name or label_name[0] == ',':
                raise ValueError
            if not value_substr or value_substr[-1] == ',':
                raise ValueError

            # Find the last unescaped quote
            i = 0
            while i < len(value_substr):
                # index() raises ValueError when no closing quote is left.
                i = value_substr.index('"', i)
                if not _is_character_escaped(value_substr[:i], i):
                    break
                i += 1

            # The label value is between the first and last quote
            quote_end = i + 1
            label_value = sub_labels[1:quote_end]
            # Replace escaping if needed
            if "\\" in label_value:
                label_value = _replace_escaping(label_value)
            if not METRIC_LABEL_NAME_RE.match(label_name):
                raise ValueError("invalid line, bad label name: " + text)
            if label_name in labels:
                raise ValueError("invalid line, duplicate label name: " + text)
            labels[label_name] = label_value

            # Remove the processed label from the sub-slice for next iteration
            sub_labels = sub_labels[quote_end + 1:]
            if sub_labels.startswith(","):
                next_comma = 1
            else:
                next_comma = 0
            sub_labels = sub_labels[next_comma:]

            # Check for missing commas
            if sub_labels and next_comma == 0:
                raise ValueError

        return labels

    except ValueError:
        raise ValueError("Invalid labels: " + text)


def _parse_sample(text):
    """Parse one sample line into a ``Sample``.

    Handles lines with or without labels and with an optional exemplar
    (introduced by ``" # "``). Raises ValueError on malformed input.
    """
    separator = " # "
    # Detect the labels in the text
    label_start = text.find("{")
    if label_start == -1 or separator in text[:label_start]:
        # We don't have labels, but there could be an exemplar.
        name_end = text.index(" ")
        name = text[:name_end]
        # Parse the remaining text after the name
        remaining_text = text[name_end + 1:]
        value, timestamp, exemplar = _parse_remaining_text(remaining_text)
        return Sample(name, {}, value, timestamp, exemplar)
    # The name is before the labels
    name = text[:label_start]
    if separator not in text:
        # Line doesn't contain an exemplar
        # We can use `rindex` to find `label_end`
        label_end = text.rindex("}")
        label = text[label_start + 1:label_end]
        labels = _parse_labels(label)
    else:
        # Line potentially contains an exemplar
        # Fallback to parsing labels with a state machine
        labels, labels_len = _parse_labels_with_state_machine(text[label_start + 1:])
        label_end = labels_len + len(name)
    # Parsing labels succeeded, continue parsing the remaining text
    # (skip the closing '}' and the space that follows it).
    remaining_text = text[label_end + 2:]
    value, timestamp, exemplar = _parse_remaining_text(remaining_text)
    return Sample(name, labels, value, timestamp, exemplar)


def _parse_remaining_text(text):
    """Parse what follows the metric name/labels: value, timestamp, exemplar.

    Returns ``(value, timestamp, exemplar)``; timestamp and exemplar are None
    when absent. Raises ValueError on malformed input or when the exemplar
    label names and values total more than 128 characters.
    """
    split_text = text.split(" ", 1)
    val = _parse_value(split_text[0])
    if len(split_text) == 1:
        # We don't have timestamp or exemplar
        return val, None, None

    timestamp = []
    exemplar_value = []
    exemplar_timestamp = []
    exemplar_labels = None
    # Index in `text` of the '}' closing the exemplar labels.
    exemplar_labels_end = None

    state = 'timestamp'
    text = split_text[1]

    for pos, char in enumerate(text):
        if state == 'timestamp':
            if char == '#' and not timestamp:
                state = 'exemplarspace'
            elif char == ' ':
                state = 'exemplarhash'
            else:
                timestamp.append(char)
        elif state == 'exemplarhash':
            if char == '#':
                state = 'exemplarspace'
            else:
                raise ValueError("Invalid line: " + text)
        elif state == 'exemplarspace':
            if char == ' ':
                state = 'exemplarstartoflabels'
            else:
                raise ValueError("Invalid line: " + text)
        elif state == 'exemplarstartoflabels':
            if char == '{':
                exemplar_labels_end = text.rindex("}")
                exemplar_labels = _parse_labels(text[pos + 1:exemplar_labels_end])
                state = 'exemplarparsedlabels'
            else:
                raise ValueError("Invalid line: " + text)
        elif state == 'exemplarparsedlabels':
            # Leave the label section only at the brace the labels were
            # sliced at, so a '}' inside a quoted label value is not
            # mistaken for the end of the labels.
            if pos == exemplar_labels_end:
                state = 'exemplarvaluespace'
        elif state == 'exemplarvaluespace':
            if char == ' ':
                state = 'exemplarvalue'
            else:
                raise ValueError("Invalid line: " + text)
        elif state == 'exemplarvalue':
            if char == ' ' and not exemplar_value:
                raise ValueError("Invalid line: " + text)
            elif char == ' ':
                state = 'exemplartimestamp'
            else:
                exemplar_value.append(char)
        elif state == 'exemplartimestamp':
            exemplar_timestamp.append(char)

    # Trailing space after value.
    if state == 'timestamp' and not timestamp:
        raise ValueError("Invalid line: " + text)

    # Trailing space after value.
    if state == 'exemplartimestamp' and not exemplar_timestamp:
        raise ValueError("Invalid line: " + text)

    # Incomplete exemplar.
    if state in ['exemplarhash', 'exemplarspace', 'exemplarstartoflabels', 'exemplarparsedlabels']:
        raise ValueError("Invalid line: " + text)

    ts = _parse_timestamp(timestamp)
    exemplar = None
    if exemplar_labels is not None:
        exemplar_length = sum(len(k) + len(v) for k, v in exemplar_labels.items())
        if exemplar_length > 128:
            raise ValueError("Exemplar labels are too long: " + text)
        exemplar = Exemplar(
            exemplar_labels,
            _parse_value(exemplar_value),
            _parse_timestamp(exemplar_timestamp),
        )

    return val, ts, exemplar


def _group_for_sample(sample, name, typ):
    """Return the labels identifying the group *sample* belongs to.

    The type-specific label (``quantile``, the stateset label named after
    the metric, or ``le``) is removed from a copy of the labels; for any
    other sample the labels dict itself is returned. Raises KeyError if the
    type-specific label is missing, so callers validate it beforehand.
    """
    if typ == 'info':
        # We can't distinguish between groups for info metrics.
        return {}
    if typ == 'summary' and sample.name == name:
        d = sample.labels.copy()
        del d['quantile']
        return d
    if typ == 'stateset':
        d = sample.labels.copy()
        del d[name]
        return d
    if typ in ['histogram', 'gaugehistogram'] and sample.name == name + '_bucket':
        d = sample.labels.copy()
        del d['le']
        return d
    return sample.labels


def _check_histogram(samples, name):
    """Validate the samples of a histogram/gaugehistogram named *name*.

    Samples are checked per (group, timestamp) run: buckets must be strictly
    increasing with non-decreasing values and end at +Inf, and the
    _count/_sum (or _gcount/_gsum) samples must be consistent with them.
    Raises ValueError on the first violation.
    """
    group = None
    timestamp = None

    def do_checks():
        # Reads the per-group state assigned in the loop below.
        if bucket != float('+Inf'):
            raise ValueError("+Inf bucket missing: " + name)
        if count is not None and value != count:
            raise ValueError("Count does not match +Inf value: " + name)
        if has_sum and count is None:
            raise ValueError("_count must be present if _sum is present: " + name)
        if has_gsum and count is None:
            raise ValueError("_gcount must be present if _gsum is present: " + name)
        if not (has_sum or has_gsum) and count is not None:
            raise ValueError("_sum/_gsum must be present if _count is present: " + name)
        if has_negative_buckets and has_sum:
            raise ValueError("Cannot have _sum with negative buckets: " + name)
        if not has_negative_buckets and has_negative_gsum:
            raise ValueError("Cannot have negative _gsum with non-negative buckets: " + name)

    for s in samples:
        suffix = s.name[len(name):]
        g = _group_for_sample(s, name, 'histogram')
        if g != group or s.timestamp != timestamp:
            # A new group starts: validate the previous one, then reset.
            if group is not None:
                do_checks()
            count = None
            bucket = None
            has_negative_buckets = False
            has_sum = False
            has_gsum = False
            has_negative_gsum = False
            value = 0
        group = g
        timestamp = s.timestamp

        if suffix == '_bucket':
            b = float(s.labels['le'])
            if b < 0:
                has_negative_buckets = True
            if bucket is not None and b <= bucket:
                raise ValueError("Buckets out of order: " + name)
            if s.value < value:
                raise ValueError("Bucket values out of order: " + name)
            bucket = b
            value = s.value
        elif suffix in ['_count', '_gcount']:
            count = s.value
        elif suffix in ['_sum']:
            has_sum = True
        elif suffix in ['_gsum']:
            has_gsum = True
            if s.value < 0:
                has_negative_gsum = True

    if group is not None:
        do_checks()


def text_fd_to_metric_families(fd):
    """Parse OpenMetrics text format from a file descriptor.

    This is a laxer parser than the main Go parser,
    so successful parsing does not imply that the parsed
    text meets the specification.

    Yields Metric's.

    Raises ValueError on malformed input, including a missing ``# EOF``
    terminator or any content after it.
    """
    name = None
    allowed_names = []
    eof = False

    seen_names = set()
    type_suffixes = {
        'counter': ['_total', '_created'],
        'summary': ['', '_count', '_sum', '_created'],
        'histogram': ['_count', '_sum', '_bucket', '_created'],
        'gaugehistogram': ['_gcount', '_gsum', '_bucket'],
        'info': ['_info'],
    }

    def build_metric(name, documentation, typ, unit, samples):
        # Finalise one metric family: reserve its sample names, validate
        # unit and histogram consistency, then build the Metric object.
        if typ is None:
            typ = 'unknown'
        for suffix in set(type_suffixes.get(typ, []) + [""]):
            if name + suffix in seen_names:
                raise ValueError("Clashing name: " + name + suffix)
            seen_names.add(name + suffix)
        if documentation is None:
            documentation = ''
        if unit is None:
            unit = ''
        if unit and not name.endswith("_" + unit):
            raise ValueError("Unit does not match metric name: " + name)
        if unit and typ in ['info', 'stateset']:
            raise ValueError("Units not allowed for this metric type: " + name)
        if typ in ['histogram', 'gaugehistogram']:
            _check_histogram(samples, name)
        metric = Metric(name, documentation, typ, unit)
        # TODO: check labelvalues are valid utf8
        metric.samples = samples
        return metric

    for line in fd:
        if line[-1] == '\n':
            line = line[:-1]

        if eof:
            raise ValueError("Received line after # EOF: " + line)

        if not line:
            raise ValueError("Received blank line")

        if line == '# EOF':
            eof = True
        elif line.startswith('#'):
            # Metadata line: "# <HELP|TYPE|UNIT> <name> <rest>".
            parts = line.split(' ', 3)
            if len(parts) < 4:
                raise ValueError("Invalid line: " + line)
            if parts[2] == name and samples:
                raise ValueError("Received metadata after samples: " + line)
            if parts[2] != name:
                if name is not None:
                    yield build_metric(name, documentation, typ, unit, samples)
                # New metric
                name = parts[2]
                unit = None
                typ = None
                documentation = None
                group = None
                seen_groups = set()
                group_timestamp = None
                group_timestamp_samples = set()
                samples = []
                allowed_names = [parts[2]]

            if parts[1] == 'HELP':
                if documentation is not None:
                    raise ValueError("More than one HELP for metric: " + line)
                documentation = _unescape_help(parts[3])
            elif parts[1] == 'TYPE':
                if typ is not None:
                    raise ValueError("More than one TYPE for metric: " + line)
                typ = parts[3]
                if typ == 'untyped':
                    raise ValueError("Invalid TYPE for metric: " + line)
                allowed_names = [name + n for n in type_suffixes.get(typ, [''])]
            elif parts[1] == 'UNIT':
                if unit is not None:
                    raise ValueError("More than one UNIT for metric: " + line)
                unit = parts[3]
            else:
                raise ValueError("Invalid line: " + line)
        else:
            sample = _parse_sample(line)
            if sample.name not in allowed_names:
                if name is not None:
                    yield build_metric(name, documentation, typ, unit, samples)
                # Start an unknown metric.
                name = sample.name
                documentation = None
                unit = None
                typ = 'unknown'
                samples = []
                group = None
                group_timestamp = None
                group_timestamp_samples = set()
                seen_groups = set()
                allowed_names = [sample.name]

            # These label checks run before _group_for_sample() below, which
            # assumes the type-specific label is present.
            if typ == 'stateset' and name not in sample.labels:
                raise ValueError("Stateset missing label: " + line)
            if (name + '_bucket' == sample.name
                    and (sample.labels.get('le', "NaN") == "NaN"
                         or _isUncanonicalNumber(sample.labels['le']))):
                raise ValueError("Invalid le label: " + line)
            if (name + '_bucket' == sample.name
                    and (not isinstance(sample.value, int) and not sample.value.is_integer())):
                raise ValueError("Bucket value must be an integer: " + line)
            if ((name + '_count' == sample.name or name + '_gcount' == sample.name)
                    and (not isinstance(sample.value, int) and not sample.value.is_integer())):
                raise ValueError("Count value must be an integer: " + line)
            if (typ == 'summary' and name == sample.name
                    and (not (0 <= float(sample.labels.get('quantile', -1)) <= 1)
                         or _isUncanonicalNumber(sample.labels['quantile']))):
                raise ValueError("Invalid quantile label: " + line)

            g = tuple(sorted(_group_for_sample(sample, name, typ).items()))
            if group is not None and g != group and g in seen_groups:
                raise ValueError("Invalid metric grouping: " + line)
            if group is not None and g == group:
                if (sample.timestamp is None) != (group_timestamp is None):
                    raise ValueError("Mix of timestamp presence within a group: " + line)
                if group_timestamp is not None and group_timestamp > sample.timestamp and typ != 'info':
                    raise ValueError("Timestamps went backwards within a group: " + line)
            else:
                group_timestamp_samples = set()

            series_id = (sample.name, tuple(sorted(sample.labels.items())))
            if sample.timestamp != group_timestamp or series_id not in group_timestamp_samples:
                # Not a duplicate due to timestamp truncation.
                samples.append(sample)
            group_timestamp_samples.add(series_id)

            group = g
            group_timestamp = sample.timestamp
            seen_groups.add(g)

            if typ == 'stateset' and sample.value not in [0, 1]:
                raise ValueError("Stateset samples can only have values zero and one: " + line)
            if typ == 'info' and sample.value != 1:
                raise ValueError("Info samples can only have value one: " + line)
            if typ == 'summary' and name == sample.name and sample.value < 0:
                raise ValueError("Quantile values cannot be negative: " + line)
            if sample.name[len(name):] in ['_total', '_sum', '_count', '_bucket', '_gcount', '_gsum'] and math.isnan(
                    sample.value):
                raise ValueError("Counter-like samples cannot be NaN: " + line)
            if sample.name[len(name):] in ['_total', '_sum', '_count', '_bucket', '_gcount'] and sample.value < 0:
                raise ValueError("Counter-like samples cannot be negative: " + line)
            if sample.exemplar and not (
                    (typ in ['histogram', 'gaugehistogram'] and sample.name.endswith('_bucket'))
                    or (typ in ['counter'] and sample.name.endswith('_total'))):
                raise ValueError("Invalid line only histogram/gaugehistogram buckets and counters can have exemplars: " + line)

    if name is not None:
        yield build_metric(name, documentation, typ, unit, samples)

    if not eof:
        raise ValueError("Missing # EOF at end")