misc.python.materialize.github
GitHub utilities.
# Copyright Materialize, Inc. and contributors. All rights reserved.
#
# Use of this software is governed by the Business Source License
# included in the LICENSE file at the root of this repository.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0.

"""GitHub utilities."""

import os
import re
from dataclasses import dataclass
from typing import Any

import requests

from materialize.observed_error import ObservedBaseError, WithIssue

# Marker lines looked for inside an issue body. Each captures the rest of the
# line after the marker.
CI_RE = re.compile("ci-regexp: (.*)")
CI_APPLY_TO = re.compile("ci-apply-to: (.*)")

# Timeout (seconds) for a single GitHub API request. Without one, requests
# waits indefinitely on an unresponsive server.
_GITHUB_REQUEST_TIMEOUT_SECONDS = 60


@dataclass
class KnownGitHubIssue:
    """A GitHub issue that declares a ``ci-regexp:`` pattern."""

    # Pattern compiled from the UTF-8 encoded ``ci-regexp:`` value.
    regex: re.Pattern[Any]
    # Stripped, lower-cased value of one ``ci-apply-to:`` line, or None when
    # the issue has no such line.
    apply_to: str | None
    # The issue object as returned by the GitHub search API.
    info: dict[str, Any]


@dataclass(kw_only=True, unsafe_hash=True)
class GitHubIssueWithInvalidRegexp(ObservedBaseError, WithIssue):
    """Error record for an issue whose ``ci-regexp:`` cannot be used."""

    # The offending pattern text, or a description of what is wrong with it.
    regex_pattern: str

    def to_text(self) -> str:
        """Return a plain-text description of the problem."""
        return f"Invalid regex in ci-regexp: {self.regex_pattern}"

    def to_markdown(self) -> str:
        """Return the description prefixed with an HTML link to the issue."""
        return f'<a href="{self.issue_url}">{self.issue_title} (#{self.issue_number})</a>: Invalid regex in ci-regexp: {self.regex_pattern}, ignoring'


def get_known_issues_from_github_page(
    token: str | None, repo: str, page: int = 1
) -> Any:
    """Fetch one page of issues in ``repo`` whose body contains ``ci-regexp:``.

    Args:
        token: GitHub token sent as a Bearer credential; omitted when falsy.
        repo: Repository in ``owner/name`` form.
        page: Value for the search API's ``page`` parameter.

    Returns:
        The decoded JSON response of the search request.

    Raises:
        ValueError: If GitHub answers with a status other than 200 or
            reports the search results as incomplete.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    if token:
        headers["Authorization"] = f"Bearer {token}"

    response = requests.get(
        f'https://api.github.com/search/issues?q=repo:{repo}%20type:issue%20in:body%20"ci-regexp%3A"&per_page=100&page={page}',
        headers=headers,
        timeout=_GITHUB_REQUEST_TIMEOUT_SECONDS,
    )

    if response.status_code != 200:
        raise ValueError(f"Bad return code from GitHub: {response.status_code}")

    issues_json = response.json()
    # Raise explicitly instead of asserting: asserts vanish under ``python -O``
    # and incomplete results would then be accepted silently.
    if issues_json["incomplete_results"]:
        raise ValueError("GitHub returned incomplete search results")
    return issues_json


def _invalid_regexp_error(
    issue: dict[str, Any], regex_pattern: str
) -> GitHubIssueWithInvalidRegexp:
    """Build the error record for an issue whose ci-regexp is unusable."""
    return GitHubIssueWithInvalidRegexp(
        internal_error_type="GITHUB_INVALID_REGEXP",
        issue_url=issue["html_url"],
        issue_title=issue["title"],
        issue_number=issue["number"],
        regex_pattern=regex_pattern,
    )


# NOTE: the ``token`` default is evaluated once, when this module is imported.
def get_known_issues_from_github(
    token: str | None = os.getenv("GITHUB_TOKEN"),
    repo: str = "MaterializeInc/database-issues",
) -> tuple[list[KnownGitHubIssue], list[GitHubIssueWithInvalidRegexp]]:
    """Collect all issues in ``repo`` that declare a ``ci-regexp:`` pattern.

    Args:
        token: GitHub token forwarded to every page request.
        repo: Repository in ``owner/name`` form.

    Returns:
        A pair ``(known_issues, issues_with_invalid_regex)``. An issue yields
        one ``KnownGitHubIssue`` per ``ci-apply-to:`` line, or a single one
        with ``apply_to=None`` when it has none. Issues with several
        ``ci-regexp:`` lines or a pattern that does not compile are reported
        in the second list instead.

    Raises:
        ValueError: Propagated from the page request on a bad response.
    """
    page = 1
    issues_json = get_known_issues_from_github_page(token, repo, page)
    while issues_json["total_count"] > len(issues_json["items"]):
        page += 1
        next_page_json = get_known_issues_from_github_page(token, repo, page)
        if not next_page_json["items"]:
            # Stop even if total_count was never reached, to avoid looping forever.
            break
        issues_json["items"].extend(next_page_json["items"])

    known_issues: list[KnownGitHubIssue] = []
    issues_with_invalid_regex: list[GitHubIssueWithInvalidRegexp] = []

    for issue in issues_json["items"]:
        # The API reports an empty issue body as null.
        body = issue.get("body") or ""
        patterns = [match.strip() for match in CI_RE.findall(body)]

        if not patterns:
            continue

        if len(patterns) > 1:
            issues_with_invalid_regex.append(
                _invalid_regexp_error(
                    issue,
                    f"Multiple regexes, but only one supported: {patterns}",
                )
            )
            continue

        try:
            # Compiled as bytes; presumably matched against raw log output.
            regex_pattern = re.compile(patterns[0].encode())
        except Exception:
            # Best effort: any failure to compile marks the issue as invalid,
            # but KeyboardInterrupt/SystemExit are no longer swallowed.
            issues_with_invalid_regex.append(
                _invalid_regexp_error(issue, patterns[0])
            )
            continue

        apply_to_values = [
            match.strip().lower() for match in CI_APPLY_TO.findall(body)
        ]
        for apply_to in apply_to_values or [None]:
            known_issues.append(KnownGitHubIssue(regex_pattern, apply_to, issue))

    return (known_issues, issues_with_invalid_regex)


def for_github_re(text: bytes) -> bytes:
    r"""Replace every newline in ``text`` with a space.

    Matching newlines in regular expressions is awkward, so ci-regexp
    patterns are not expected to handle them. For example, this makes
    matching the following panic easier:

      thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs:1878:5:
      assertion `left == right` failed

    Previously the regex would have had to be:
      thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs.*\n.*left == right

    With this function it can be:
      thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs.*left == right
    """
    return text.replace(b"\n", b" ")
# Captures the rest of the line following a "ci-regexp: " marker.
CI_RE = re.compile(r"ci-regexp: (.*)")
# Captures the rest of the line following a "ci-apply-to: " marker.
CI_APPLY_TO = re.compile(r"ci-apply-to: (.*)")
@dataclass
class
KnownGitHubIssue:
@dataclass(kw_only=True, unsafe_hash=True)
class GitHubIssueWithInvalidRegexp(ObservedBaseError, WithIssue):
    """Error record for a GitHub issue whose ``ci-regexp:`` cannot be used."""

    # The offending pattern text, or a description of what is wrong with it.
    regex_pattern: str

    def to_text(self) -> str:
        """Return a plain-text description of the problem."""
        description = f"Invalid regex in ci-regexp: {self.regex_pattern}"
        return description

    def to_markdown(self) -> str:
        """Return the description prefixed with an HTML link to the issue."""
        issue_link = (
            f'<a href="{self.issue_url}">'
            f"{self.issue_title} (#{self.issue_number})</a>"
        )
        return (
            f"{issue_link}: Invalid regex in ci-regexp: "
            f"{self.regex_pattern}, ignoring"
        )
GitHubIssueWithInvalidRegexp( issue_number: int, issue_url: str, issue_title: str, internal_error_type: str, *, regex_pattern: str)
# Timeout (seconds) for a single GitHub API request. Without one, requests
# waits indefinitely on an unresponsive server.
_GITHUB_REQUEST_TIMEOUT_SECONDS = 60


def get_known_issues_from_github_page(
    token: str | None, repo: str, page: int = 1
) -> Any:
    """Fetch one page of issues in ``repo`` whose body contains ``ci-regexp:``.

    Args:
        token: GitHub token sent as a Bearer credential; omitted when falsy.
        repo: Repository in ``owner/name`` form.
        page: Value for the search API's ``page`` parameter.

    Returns:
        The decoded JSON response of the search request.

    Raises:
        ValueError: If GitHub answers with a status other than 200 or
            reports the search results as incomplete.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    if token:
        headers["Authorization"] = f"Bearer {token}"

    response = requests.get(
        f'https://api.github.com/search/issues?q=repo:{repo}%20type:issue%20in:body%20"ci-regexp%3A"&per_page=100&page={page}',
        headers=headers,
        timeout=_GITHUB_REQUEST_TIMEOUT_SECONDS,
    )

    if response.status_code != 200:
        raise ValueError(f"Bad return code from GitHub: {response.status_code}")

    issues_json = response.json()
    # Raise explicitly instead of asserting: asserts vanish under ``python -O``
    # and incomplete results would then be accepted silently.
    if issues_json["incomplete_results"]:
        raise ValueError("GitHub returned incomplete search results")
    return issues_json
def _invalid_regexp_error(
    issue: dict[str, Any], regex_pattern: str
) -> GitHubIssueWithInvalidRegexp:
    """Build the error record for an issue whose ci-regexp is unusable."""
    return GitHubIssueWithInvalidRegexp(
        internal_error_type="GITHUB_INVALID_REGEXP",
        issue_url=issue["html_url"],
        issue_title=issue["title"],
        issue_number=issue["number"],
        regex_pattern=regex_pattern,
    )


# NOTE: the ``token`` default is evaluated once, when this module is imported.
def get_known_issues_from_github(
    token: str | None = os.getenv("GITHUB_TOKEN"),
    repo: str = "MaterializeInc/database-issues",
) -> tuple[list[KnownGitHubIssue], list[GitHubIssueWithInvalidRegexp]]:
    """Collect all issues in ``repo`` that declare a ``ci-regexp:`` pattern.

    Args:
        token: GitHub token forwarded to every page request.
        repo: Repository in ``owner/name`` form.

    Returns:
        A pair ``(known_issues, issues_with_invalid_regex)``. An issue yields
        one ``KnownGitHubIssue`` per ``ci-apply-to:`` line, or a single one
        with ``apply_to=None`` when it has none. Issues with several
        ``ci-regexp:`` lines or a pattern that does not compile are reported
        in the second list instead.

    Raises:
        ValueError: Propagated from the page request on a bad response.
    """
    page = 1
    issues_json = get_known_issues_from_github_page(token, repo, page)
    while issues_json["total_count"] > len(issues_json["items"]):
        page += 1
        next_page_json = get_known_issues_from_github_page(token, repo, page)
        if not next_page_json["items"]:
            # Stop even if total_count was never reached, to avoid looping forever.
            break
        issues_json["items"].extend(next_page_json["items"])

    known_issues: list[KnownGitHubIssue] = []
    issues_with_invalid_regex: list[GitHubIssueWithInvalidRegexp] = []

    for issue in issues_json["items"]:
        # The API reports an empty issue body as null.
        body = issue.get("body") or ""
        patterns = [match.strip() for match in CI_RE.findall(body)]

        if not patterns:
            continue

        if len(patterns) > 1:
            issues_with_invalid_regex.append(
                _invalid_regexp_error(
                    issue,
                    f"Multiple regexes, but only one supported: {patterns}",
                )
            )
            continue

        try:
            # Compiled as bytes; presumably matched against raw log output.
            regex_pattern = re.compile(patterns[0].encode())
        except Exception:
            # Best effort: any failure to compile marks the issue as invalid,
            # but KeyboardInterrupt/SystemExit are no longer swallowed.
            issues_with_invalid_regex.append(
                _invalid_regexp_error(issue, patterns[0])
            )
            continue

        apply_to_values = [
            match.strip().lower() for match in CI_APPLY_TO.findall(body)
        ]
        for apply_to in apply_to_values or [None]:
            known_issues.append(KnownGitHubIssue(regex_pattern, apply_to, issue))

    return (known_issues, issues_with_invalid_regex)
def for_github_re(text: bytes) -> bytes:
    r"""Replace every newline in ``text`` with a space.

    Matching newlines in regular expressions is awkward, so ci-regexp
    patterns are not expected to handle them. For example, this makes
    matching the following panic easier:

      thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs:1878:5:
      assertion `left == right` failed

    Previously the regex would have had to be:
      thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs.*\n.*left == right

    With this function it can be:
      thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs.*left == right
    """
    # The docstring is raw so the example regex keeps its literal "\n".
    return text.replace(b"\n", b" ")
Matching newlines in regular expressions is awkward, so ci-regexp patterns are not expected to handle them; instead, all newlines are replaced with a space. For example, this makes matching the following panic easier:
thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs:1878:5:
assertion `left == right` failed
Previously the regex would have had to be:
thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs.*\n.*left == right
With this function it can be:
thread 'test_auth_deduplication' panicked at src/environmentd/tests/auth.rs.*left == right