misc.python.materialize.github

GitHub utilities.

  1# Copyright Materialize, Inc. and contributors. All rights reserved.
  2#
  3# Use of this software is governed by the Business Source License
  4# included in the LICENSE file at the root of this repository.
  5#
  6# As of the Change Date specified in that file, in accordance with
  7# the Business Source License, use of this software will be governed
  8# by the Apache License, Version 2.0.
  9
 10"""GitHub utilities."""
 11
 12import os
 13import re
 14from dataclasses import dataclass
 15from typing import Any
 16
 17import requests
 18
 19from materialize.observed_error import ObservedBaseError, WithIssue
 20
 21CI_RE = re.compile("ci-regexp: (.*)")
 22CI_APPLY_TO = re.compile("ci-apply-to: (.*)")
 23
 24
 25@dataclass
 26class KnownGitHubIssue:
 27    regex: re.Pattern[Any]
 28    apply_to: str | None
 29    info: dict[str, Any]
 30
 31
 32@dataclass(kw_only=True, unsafe_hash=True)
 33class GitHubIssueWithInvalidRegexp(ObservedBaseError, WithIssue):
 34    regex_pattern: str
 35
 36    def to_text(self) -> str:
 37        return f"Invalid regex in ci-regexp: {self.regex_pattern}"
 38
 39    def to_markdown(self) -> str:
 40        return f'<a href="{self.issue_url}">{self.issue_title} (#{self.issue_number})</a>: Invalid regex in ci-regexp: {self.regex_pattern}, ignoring'
 41
 42
 43def get_known_issues_from_github_page(
 44    token: str | None, repo: str, page: int = 1
 45) -> Any:
 46    headers = {
 47        "Accept": "application/vnd.github+json",
 48        "X-GitHub-Api-Version": "2022-11-28",
 49    }
 50    if token:
 51        headers["Authorization"] = f"Bearer {token}"
 52
 53    response = requests.get(
 54        f'https://api.github.com/search/issues?q=repo:{repo}%20type:issue%20in:body%20"ci-regexp%3A"&per_page=100&page={page}',
 55        headers=headers,
 56    )
 57
 58    if response.status_code != 200:
 59        raise ValueError(f"Bad return code from GitHub: {response.status_code}")
 60
 61    issues_json = response.json()
 62    assert issues_json["incomplete_results"] == False
 63    return issues_json
 64
 65
 66def get_known_issues_from_github(
 67    token: str | None = os.getenv("GITHUB_TOKEN"),
 68    repo: str = "MaterializeInc/database-issues",
 69) -> tuple[list[KnownGitHubIssue], list[GitHubIssueWithInvalidRegexp]]:
 70    page = 1
 71    issues_json = get_known_issues_from_github_page(token, repo, page)
 72    while issues_json["total_count"] > len(issues_json["items"]):
 73        page += 1
 74        next_page_json = get_known_issues_from_github_page(token, repo, page)
 75        if not next_page_json["items"]:
 76            break
 77        issues_json["items"].extend(next_page_json["items"])
 78
 79    known_issues = []
 80    issues_with_invalid_regex = []
 81
 82    for issue in issues_json["items"]:
 83        matches = CI_RE.findall(issue["body"])
 84        matches_apply_to = CI_APPLY_TO.findall(issue["body"])
 85
 86        if len(matches) > 1:
 87            issues_with_invalid_regex.append(
 88                GitHubIssueWithInvalidRegexp(
 89                    internal_error_type="GITHUB_INVALID_REGEXP",
 90                    issue_url=issue["html_url"],
 91                    issue_title=issue["title"],
 92                    issue_number=issue["number"],
 93                    regex_pattern=f"Multiple regexes, but only one supported: {[match.strip() for match in matches]}",
 94                )
 95            )
 96            continue
 97
 98        if len(matches) == 0:
 99            continue
100
101        try:
102            regex_pattern = re.compile(matches[0].strip().encode())
103        except:
104            issues_with_invalid_regex.append(
105                GitHubIssueWithInvalidRegexp(
106                    internal_error_type="GITHUB_INVALID_REGEXP",
107                    issue_url=issue["html_url"],
108                    issue_title=issue["title"],
109                    issue_number=issue["number"],
110                    regex_pattern=matches[0].strip(),
111                )
112            )
113            continue
114
115        if matches_apply_to:
116            for match_apply_to in matches_apply_to:
117                known_issues.append(
118                    KnownGitHubIssue(
119                        regex_pattern, match_apply_to.strip().lower(), issue
120                    )
121                )
122        else:
123            known_issues.append(KnownGitHubIssue(regex_pattern, None, issue))
124
125    return (known_issues, issues_with_invalid_regex)
126
127
def for_github_re(text: bytes) -> bytes:
    r"""Replace every newline in ``text`` with a space.

    Matching newlines in regular expressions is kind of annoying, so don't
    expect ci-regexp to do that correctly; instead all newlines are replaced
    with a space. For example, this makes matching this panic easier:

      thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs:1878:5:
      assertion `left == right` failed

    Previously the regex should have been:
      thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs.*\n.*left == right

    With this function it can be:
      thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs.*left == right

    Args:
        text: The bytes to normalize.

    Returns:
        A copy of ``text`` in which each ``\n`` byte is replaced by a space.
    """
    # The docstring is raw so that the "\n" in the example regex stays a
    # literal backslash-n instead of becoming a line break in rendered docs.
    return text.replace(b"\n", b" ")
CI_RE = re.compile('ci-regexp: (.*)')
CI_APPLY_TO = re.compile('ci-apply-to: (.*)')
@dataclass
class KnownGitHubIssue:
26@dataclass
27class KnownGitHubIssue:
28    regex: re.Pattern[Any]
29    apply_to: str | None
30    info: dict[str, Any]
KnownGitHubIssue( regex: re.Pattern[typing.Any], apply_to: str | None, info: dict[str, typing.Any])
regex: re.Pattern[typing.Any]
apply_to: str | None
info: dict[str, typing.Any]
@dataclass(kw_only=True, unsafe_hash=True)
class GitHubIssueWithInvalidRegexp(materialize.observed_error.ObservedBaseError, materialize.observed_error.WithIssue):
33@dataclass(kw_only=True, unsafe_hash=True)
34class GitHubIssueWithInvalidRegexp(ObservedBaseError, WithIssue):
35    regex_pattern: str
36
37    def to_text(self) -> str:
38        return f"Invalid regex in ci-regexp: {self.regex_pattern}"
39
40    def to_markdown(self) -> str:
41        return f'<a href="{self.issue_url}">{self.issue_title} (#{self.issue_number})</a>: Invalid regex in ci-regexp: {self.regex_pattern}, ignoring'
GitHubIssueWithInvalidRegexp( issue_number: int, issue_url: str, issue_title: str, internal_error_type: str, *, regex_pattern: str)
regex_pattern: str
def to_text(self) -> str:
37    def to_text(self) -> str:
38        return f"Invalid regex in ci-regexp: {self.regex_pattern}"
def to_markdown(self) -> str:
40    def to_markdown(self) -> str:
41        return f'<a href="{self.issue_url}">{self.issue_title} (#{self.issue_number})</a>: Invalid regex in ci-regexp: {self.regex_pattern}, ignoring'
def get_known_issues_from_github_page(token: str | None, repo: str, page: int = 1) -> Any:
44def get_known_issues_from_github_page(
45    token: str | None, repo: str, page: int = 1
46) -> Any:
47    headers = {
48        "Accept": "application/vnd.github+json",
49        "X-GitHub-Api-Version": "2022-11-28",
50    }
51    if token:
52        headers["Authorization"] = f"Bearer {token}"
53
54    response = requests.get(
55        f'https://api.github.com/search/issues?q=repo:{repo}%20type:issue%20in:body%20"ci-regexp%3A"&per_page=100&page={page}',
56        headers=headers,
57    )
58
59    if response.status_code != 200:
60        raise ValueError(f"Bad return code from GitHub: {response.status_code}")
61
62    issues_json = response.json()
63    assert issues_json["incomplete_results"] == False
64    return issues_json
def get_known_issues_from_github( token: str | None = None, repo: str = 'MaterializeInc/database-issues') -> tuple[list[KnownGitHubIssue], list[GitHubIssueWithInvalidRegexp]]:
 67def get_known_issues_from_github(
 68    token: str | None = os.getenv("GITHUB_TOKEN"),
 69    repo: str = "MaterializeInc/database-issues",
 70) -> tuple[list[KnownGitHubIssue], list[GitHubIssueWithInvalidRegexp]]:
 71    page = 1
 72    issues_json = get_known_issues_from_github_page(token, repo, page)
 73    while issues_json["total_count"] > len(issues_json["items"]):
 74        page += 1
 75        next_page_json = get_known_issues_from_github_page(token, repo, page)
 76        if not next_page_json["items"]:
 77            break
 78        issues_json["items"].extend(next_page_json["items"])
 79
 80    known_issues = []
 81    issues_with_invalid_regex = []
 82
 83    for issue in issues_json["items"]:
 84        matches = CI_RE.findall(issue["body"])
 85        matches_apply_to = CI_APPLY_TO.findall(issue["body"])
 86
 87        if len(matches) > 1:
 88            issues_with_invalid_regex.append(
 89                GitHubIssueWithInvalidRegexp(
 90                    internal_error_type="GITHUB_INVALID_REGEXP",
 91                    issue_url=issue["html_url"],
 92                    issue_title=issue["title"],
 93                    issue_number=issue["number"],
 94                    regex_pattern=f"Multiple regexes, but only one supported: {[match.strip() for match in matches]}",
 95                )
 96            )
 97            continue
 98
 99        if len(matches) == 0:
100            continue
101
102        try:
103            regex_pattern = re.compile(matches[0].strip().encode())
104        except:
105            issues_with_invalid_regex.append(
106                GitHubIssueWithInvalidRegexp(
107                    internal_error_type="GITHUB_INVALID_REGEXP",
108                    issue_url=issue["html_url"],
109                    issue_title=issue["title"],
110                    issue_number=issue["number"],
111                    regex_pattern=matches[0].strip(),
112                )
113            )
114            continue
115
116        if matches_apply_to:
117            for match_apply_to in matches_apply_to:
118                known_issues.append(
119                    KnownGitHubIssue(
120                        regex_pattern, match_apply_to.strip().lower(), issue
121                    )
122                )
123        else:
124            known_issues.append(KnownGitHubIssue(regex_pattern, None, issue))
125
126    return (known_issues, issues_with_invalid_regex)
def for_github_re(text: bytes) -> bytes:
129def for_github_re(text: bytes) -> bytes:
130    """
131    Matching newlines in regular expressions is kind of annoying, don't expect
132    ci-regexp to do that correctly, but instead replace all newlines with a
133    space. For examples this makes matching this panic easier:
134
135      thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs:1878:5:
136      assertion `left == right` failed
137
138    Previously the regex should have been:
139      thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs.*\n.*left == right
140
141    With this function it can be:
142      thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs.*left == right
143    """
144    return text.replace(b"\n", b" ")

Matching newlines in regular expressions is kind of annoying, so don't expect ci-regexp to do that correctly; instead, all newlines are replaced with a space. For example, this makes matching this panic easier:

  thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs:1878:5:
  assertion `left == right` failed

Previously the regex should have been:
  thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs.*\n.*left == right

With this function it can be:
  thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs.*left == right