From 012e3b1aca11060179e994707ea019a70b69b1d5 Mon Sep 17 00:00:00 2001 From: Max Mehl Date: Tue, 3 Dec 2024 09:20:02 +0100 Subject: [PATCH 1/5] start improving graphql query so that all collaborators of all repos are fetched at once This should fix hitting the secondary limits Missing: * GraphQL limits may be hit for the 100 repos x 100 collaborators. Investigate * If more than 100 collaborators, a routine is missing * Refactoring, the function is HUGE --- gh_org_mgr/_gh_org.py | 108 ++++++++++++++++++++++++++++++------------ 1 file changed, 77 insertions(+), 31 deletions(-) diff --git a/gh_org_mgr/_gh_org.py b/gh_org_mgr/_gh_org.py index ae5a87d..8caa435 100644 --- a/gh_org_mgr/_gh_org.py +++ b/gh_org_mgr/_gh_org.py @@ -946,59 +946,107 @@ def _convert_graphql_perm_to_rest(self, permission: str) -> str: return permission - def _fetch_collaborators_of_repo(self, repo: Repository): - """Get all collaborators (individuals) of a GitHub repo with their - permissions using the GraphQL API""" - # TODO: Consider doing this for all repositories at once, but calculate - # costs beforehand + def _fetch_collaborators_of_all_organization_repos(self): + """Get all collaborators (individuals) of all repos of a GitHub + organization with their permissions using the GraphQL API""" + graphql_query = """ - query($owner: String!, $name: String!, $cursor: String) { - repository(owner: $owner, name: $name) { - collaborators(first: 100, after: $cursor) { + query($owner: String!, $cursor: String) { + organization(login: $owner) { + repositories(first: 100, after: $cursor) { edges { node { - login + name + collaborators(first: 100) { + edges { + node { + login + } + permission + } + pageInfo { + endCursor + hasNextPage + } + } } - permission } pageInfo { endCursor hasNextPage } + } } } - } """ # Initial query parameters - variables = {"owner": self.org.login, "name": repo.name, "cursor": None} + variables = {"owner": self.org.login, "cursor": None} - collaborators = [] - has_next_page = True + repo_collaborators: dict[str, dict] = {} + missing_data_for_repos = {} + more_repos = True - while has_next_page: - logging.debug("Requesting collaborators for %s", repo.name) + while more_repos: + logging.debug("Requesting collaborators for %s", self.org.login) result = run_graphql_query(graphql_query, variables, self.gh_token) try: - collaborators.extend(result["data"]["repository"]["collaborators"]["edges"]) - has_next_page = result["data"]["repository"]["collaborators"]["pageInfo"][ + for repo in result["data"]["organization"]["repositories"]["edges"]: + try: + repo_name = repo["node"]["name"] + logging.debug( + "Extracting collaborators for %s from GraphQL response", repo_name + ) + except KeyError: + logging.error( + "Did not find a repo name in the GraphQL response which " + "seems to hint to a bug: %s", + repo, + ) + sys.exit(1) + + # fill in collaborators of repo + collaborators = repo["node"]["collaborators"]["edges"] + repo_collaborators[repo_name] = collaborators + + # Find out if there are more than 100 collaborators in the + # GraphQL response for this repo + if repo["node"]["collaborators"]["pageInfo"]["hasNextPage"]: + missing_data_for_repos[repo_name] = repo["node"]["collaborators"][ + "pageInfo" + ]["endCursor"] + + # Find out if there are more than 100 repos in the GraphQL + # response. If so, get cursor + more_repos = result["data"]["organization"]["repositories"]["pageInfo"][ "hasNextPage" ] - variables["cursor"] = result["data"]["repository"]["collaborators"]["pageInfo"][ + variables["cursor"] = result["data"]["organization"]["repositories"]["pageInfo"][ "endCursor" ] except (TypeError, KeyError): - logging.debug("Repo %s does not seem to have any collaborators", repo.name) + logging.debug("Repo %s does not seem to have any collaborators", repo_name) continue - # Extract relevant data - for collaborator in collaborators: - login: str = collaborator["node"]["login"] - # Skip entry if collaborator is org owner, which is "admin" anyway - if login.lower() in [user.login.lower() for user in self.current_org_owners]: - continue - permission = self._convert_graphql_perm_to_rest(collaborator["permission"]) - self.current_repos_collaborators[repo][login.lower()] = permission + if missing_data_for_repos: + logging.warning( + "Not all collaborators of all repos have been fetched. Missing data: %s", + missing_data_for_repos, + ) + # TODO: Need to make individual graphql queries for these repos + + # Iterate repos, and fill self.current_repos_collaborators[repo] with + # collaborators as fetched from GraphQL and put into repo_collaborators + for repo, collaborators in self.current_repos_collaborators.items(): + if repo.name in repo_collaborators: + # Extract each collaborator from the GraphQL response for this repo + for collaborator in repo_collaborators[repo.name]: + login: str = collaborator["node"]["login"] + # Skip entry if collaborator is org owner, which is "admin" anyway + if login.lower() in [user.login.lower() for user in self.current_org_owners]: + continue + permission = self._convert_graphql_perm_to_rest(collaborator["permission"]) + collaborators[login.lower()] = permission def _get_current_repos_and_user_perms(self): """Get all repos, their current collaborators and their permissions""" @@ -1006,9 +1054,7 @@ def _get_current_repos_and_user_perms(self): for repo in self.current_repos_teams: self.current_repos_collaborators[repo] = {} - for repo in self.current_repos_collaborators: - # Get users for this repo - self._fetch_collaborators_of_repo(repo) + self._fetch_collaborators_of_all_organization_repos() def _get_default_repository_permission(self): """Get the default repository permission for all users. Convert to From 5f543f1b09ce987640407bad798f5ccd283a0424 Mon Sep 17 00:00:00 2001 From: Max Mehl Date: Tue, 3 Dec 2024 09:42:43 +0100 Subject: [PATCH 2/5] censor app private key from debug output --- gh_org_mgr/_gh_org.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gh_org_mgr/_gh_org.py b/gh_org_mgr/_gh_org.py index 8caa435..137c496 100644 --- a/gh_org_mgr/_gh_org.py +++ b/gh_org_mgr/_gh_org.py @@ -115,7 +115,7 @@ def censor_half_string(string: str) -> str: half2 = len(string) - half1 return string[:half1] + "*" * (half2) - sensible_keys = ["gh_token"] + sensible_keys = ["gh_token", "gh_app_private_key"] for key in sensible_keys: if value := dictionary.get(key, ""): dictionary[key] = censor_half_string(value) From ef0c4d1586508b0e232ef2047073ef2265c66653 Mon Sep 17 00:00:00 2001 From: Max Mehl Date: Tue, 3 Dec 2024 09:43:17 +0100 Subject: [PATCH 3/5] split _fetch_collaborators_of_all_organization_repos() into smaller parts --- gh_org_mgr/_gh_org.py | 85 ++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 41 deletions(-) diff --git a/gh_org_mgr/_gh_org.py b/gh_org_mgr/_gh_org.py index 137c496..adc72e3 100644 --- a/gh_org_mgr/_gh_org.py +++ b/gh_org_mgr/_gh_org.py @@ -983,50 +983,20 @@ def _fetch_collaborators_of_all_organization_repos(self): # Initial query parameters variables = {"owner": self.org.login, "cursor": None} + # dicts in which we store the collaborators of each repo, and the repos + # for which there are more than 100 collaborators repo_collaborators: dict[str, dict] = {} - missing_data_for_repos = {} - more_repos = True + missing_data_for_repos: dict[str, str] = {} + more_repos = True while more_repos: logging.debug("Requesting collaborators for %s", self.org.login) result = run_graphql_query(graphql_query, variables, self.gh_token) - try: - for repo in result["data"]["organization"]["repositories"]["edges"]: - try: - repo_name = repo["node"]["name"] - logging.debug( - "Extracting collaborators for %s from GraphQL response", repo_name - ) - except KeyError: - logging.error( - "Did not find a repo name in the GraphQL response which " - "seems to hint to a bug: %s", - repo, - ) - sys.exit(1) - - # fill in collaborators of repo - collaborators = repo["node"]["collaborators"]["edges"] - repo_collaborators[repo_name] = collaborators - - # Find out if there are more than 100 collaborators in the - # GraphQL response for this repo - if repo["node"]["collaborators"]["pageInfo"]["hasNextPage"]: - missing_data_for_repos[repo_name] = repo["node"]["collaborators"][ - "pageInfo" - ]["endCursor"] - - # Find out if there are more than 100 repos in the GraphQL - # response. If so, get cursor - more_repos = result["data"]["organization"]["repositories"]["pageInfo"][ - "hasNextPage" - ] - variables["cursor"] = result["data"]["organization"]["repositories"]["pageInfo"][ - "endCursor" - ] - except (TypeError, KeyError): - logging.debug("Repo %s does not seem to have any collaborators", repo_name) - continue + self._process_graphql_response(result, repo_collaborators, missing_data_for_repos) + more_repos = result["data"]["organization"]["repositories"]["pageInfo"]["hasNextPage"] + variables["cursor"] = result["data"]["organization"]["repositories"]["pageInfo"][ + "endCursor" + ] if missing_data_for_repos: logging.warning( @@ -1035,8 +1005,41 @@ def _fetch_collaborators_of_all_organization_repos(self): ) # TODO: Need to make individual graphql queries for these repos - # Iterate repos, and fill self.current_repos_collaborators[repo] with - # collaborators as fetched from GraphQL and put into repo_collaborators + self._populate_current_repos_collaborators(repo_collaborators) + + def _process_graphql_response( + self, result, repo_collaborators: dict[str, dict], missing_data_for_repos: dict[str, str] + ): + """Process the GraphQL response and extract collaborators""" + try: + for repo_edges in result["data"]["organization"]["repositories"]["edges"]: + try: + repo_name: str = repo_edges["node"]["name"] + logging.debug( + "Extracting collaborators for %s from GraphQL response", repo_name + ) + except KeyError: + logging.error( + "Did not find a repo name in the GraphQL response which " + "seems to hint to a bug: %s", + repo_edges, + ) + sys.exit(1) + + # fill in collaborators of repo + repo_collaborators[repo_name] = repo_edges["node"]["collaborators"]["edges"] + + # Find out if there are more than 100 collaborators in the + # GraphQL response for this repo + if repo_edges["node"]["collaborators"]["pageInfo"]["hasNextPage"]: + missing_data_for_repos[repo_name] = repo_edges["node"]["collaborators"][ + "pageInfo" + ]["endCursor"] + except (TypeError, KeyError): + logging.debug("Repo does not seem to have any collaborators") + + def _populate_current_repos_collaborators(self, repo_collaborators: dict[str, dict]): + """Populate self.current_repos_collaborators with data from repo_collaborators""" for repo, collaborators in self.current_repos_collaborators.items(): if repo.name in repo_collaborators: # Extract each collaborator from the GraphQL response for this repo From 72bf40019ecccea6a47b99e7388ff27da81f2421 Mon Sep 17 00:00:00 2001 From: Max Mehl Date: Tue, 3 Dec 2024 11:38:37 +0100 Subject: [PATCH 4/5] add handling of graphql response for org and repo level --- gh_org_mgr/_gh_org.py | 183 +++++++++++++++++++++++++++++++++--------- 1 file changed, 147 insertions(+), 36 deletions(-) diff --git a/gh_org_mgr/_gh_org.py b/gh_org_mgr/_gh_org.py index adc72e3..2e9d078 100644 --- a/gh_org_mgr/_gh_org.py +++ b/gh_org_mgr/_gh_org.py @@ -40,6 +40,7 @@ class GHorg: # pylint: disable=too-many-instance-attributes, too-many-lines configured_teams: dict[str, dict | None] = field(default_factory=dict) newly_added_users: list[NamedUser] = field(default_factory=list) current_repos_teams: dict[Repository, dict[Team, str]] = field(default_factory=dict) + graphql_repos_collaborators: dict[str, list[dict]] = field(default_factory=dict) current_repos_collaborators: dict[Repository, dict[str, str]] = field(default_factory=dict) configured_repos_collaborators: dict[str, dict[str, str]] = field(default_factory=dict) archived_repos: list[Repository] = field(default_factory=list) @@ -946,18 +947,18 @@ def _convert_graphql_perm_to_rest(self, permission: str) -> str: return permission - def _fetch_collaborators_of_all_organization_repos(self): + def _fetch_collaborators_of_all_organization_repos(self) -> None: """Get all collaborators (individuals) of all repos of a GitHub organization with their permissions using the GraphQL API""" graphql_query = """ query($owner: String!, $cursor: String) { organization(login: $owner) { - repositories(first: 100, after: $cursor) { + repositories(first: 5, after: $cursor) { edges { node { name - collaborators(first: 100) { + collaborators(first: 20) { edges { node { login @@ -980,39 +981,117 @@ def _fetch_collaborators_of_all_organization_repos(self): } """ - # Initial query parameters + # Initial query parameters for org-level request variables = {"owner": self.org.login, "cursor": None} - # dicts in which we store the collaborators of each repo, and the repos - # for which there are more than 100 collaborators - repo_collaborators: dict[str, dict] = {} - missing_data_for_repos: dict[str, str] = {} + # dict in which we store repos for which there are more than 100 + # collaborators, and their respective end cursors + next_page_cursors_for_repos: dict[str, str] = {} - more_repos = True - while more_repos: + more_repos_in_org = True + while more_repos_in_org: logging.debug("Requesting collaborators for %s", self.org.login) - result = run_graphql_query(graphql_query, variables, self.gh_token) - self._process_graphql_response(result, repo_collaborators, missing_data_for_repos) - more_repos = result["data"]["organization"]["repositories"]["pageInfo"]["hasNextPage"] - variables["cursor"] = result["data"]["organization"]["repositories"]["pageInfo"][ - "endCursor" - ] + org_result = run_graphql_query(graphql_query, variables, self.gh_token) + more_repos_in_org, variables["cursor"] = self._extract_data_from_graphql_response( + graphql_response=org_result, next_page_cursors_for_repos=next_page_cursors_for_repos + ) - if missing_data_for_repos: - logging.warning( + # If there are more than 100 collaborators in a repo, we need to fetch + # rest via individual GraphQL queries + if next_page_cursors_for_repos: + logging.debug( "Not all collaborators of all repos have been fetched. Missing data: %s", - missing_data_for_repos, + next_page_cursors_for_repos, ) - # TODO: Need to make individual graphql queries for these repos + for repo_name, end_cursor in next_page_cursors_for_repos.items(): + more_collaborators_in_repo = True + while more_collaborators_in_repo: + logging.debug("Requesting additional collaborators for repo %s", repo_name) + # Initial query parameters for repo-level request + repo_variables = { + "owner": self.org.login, + "repo": repo_name, + "cursor": end_cursor, + } + repo_query = """ + query($owner: String!, $repo: String!, $cursor: String) { + repository(owner: $owner, name: $repo) { + collaborators(first: 100, after: $cursor) { + edges { + node { + login + } + permission + } + pageInfo { + endCursor + hasNextPage + } + } + } + } + """ + repo_result = run_graphql_query(repo_query, repo_variables, self.gh_token) + more_collaborators_in_repo, end_cursor = ( + self._extract_data_from_graphql_response( + graphql_response=repo_result, + next_page_cursors_for_repos=next_page_cursors_for_repos, + single_repo_name=repo_name, + ) + ) - self._populate_current_repos_collaborators(repo_collaborators) + # All collaborators from all repos have been fetched, now populate the + # actual dictionary + self._populate_current_repos_collaborators() - def _process_graphql_response( - self, result, repo_collaborators: dict[str, dict], missing_data_for_repos: dict[str, str] - ): - """Process the GraphQL response and extract collaborators""" - try: - for repo_edges in result["data"]["organization"]["repositories"]["edges"]: + def _extract_data_from_graphql_response( + self, + graphql_response: dict, + next_page_cursors_for_repos: dict[str, str], + single_repo_name: str = "", + ) -> tuple[bool, str]: + """ + Extracts collaborator data from a GraphQL response for either an + organization or a single repository. + + Args: + graphql_response (dict): The GraphQL response containing the data. + next_page_cursors_for_repos (dict[str, str]): A dictionary to store + the next page cursors for repositories. + single_repo_name (str, optional): The name of a single repository to + extract data for. Defaults to "". + + Returns: + tuple[bool, str]: A tuple containing a boolean indicating if there + is a next page and a string for the cursor. + - For organization level extraction: + - bool: Indicates if there is a next page of repositories. + - str: The cursor for the next page of repositories. + - For single repository extraction: + - bool: Indicates if there is a next page of collaborators. + - str: The cursor for the next page of collaborators. + + Raises: + SystemExit: If a repository name is not found in the GraphQL + response at the organization level. + + This method processes the GraphQL response to extract information about + repositories and their collaborators. It handles pagination by + identifying if there are more pages of repositories or collaborators to + be fetched. + """ + if not single_repo_name and "organization" in graphql_response["data"]: + logging.debug("Extracting collaborators for organization from GraphQL response") + + # Initialise returns + org_has_next_page = graphql_response["data"]["organization"]["repositories"][ + "pageInfo" + ]["hasNextPage"] + org_cursor = graphql_response["data"]["organization"]["repositories"]["pageInfo"][ + "endCursor" + ] + + for repo_edges in graphql_response["data"]["organization"]["repositories"]["edges"]: try: repo_name: str = repo_edges["node"]["name"] logging.debug( @@ -1020,30 +1099,62 @@ def _process_graphql_response( ) except KeyError: logging.error( - "Did not find a repo name in the GraphQL response which " - "seems to hint to a bug: %s", + "Did not find a repo name in the GraphQL response " + "(organization level) which seems to hint to a bug: %s", repo_edges, ) sys.exit(1) # fill in collaborators of repo - repo_collaborators[repo_name] = repo_edges["node"]["collaborators"]["edges"] + try: + self.graphql_repos_collaborators[repo_name].extend( + repo_edges["node"]["collaborators"]["edges"] + ) + except (TypeError, KeyError): + logging.debug("Repo %s does not seem to have any collaborators", repo_name) # Find out if there are more than 100 collaborators in the # GraphQL response for this repo if repo_edges["node"]["collaborators"]["pageInfo"]["hasNextPage"]: - missing_data_for_repos[repo_name] = repo_edges["node"]["collaborators"][ + next_page_cursors_for_repos[repo_name] = repo_edges["node"]["collaborators"][ "pageInfo" ]["endCursor"] - except (TypeError, KeyError): - logging.debug("Repo does not seem to have any collaborators") - def _populate_current_repos_collaborators(self, repo_collaborators: dict[str, dict]): + return org_has_next_page, org_cursor + + if single_repo_name and "repository" in graphql_response["data"]: + logging.debug( + "Extracting collaborators for repository %s from GraphQL response", single_repo_name + ) + + # Initialise returns + repo_has_next_page = graphql_response["data"]["repository"]["collaborators"][ + "pageInfo" + ]["hasNextPage"] + repo_cursor = graphql_response["data"]["repository"]["collaborators"]["pageInfo"][ + "endCursor" + ] + + # fill in collaborators of repo + try: + self.graphql_repos_collaborators[single_repo_name].extend( + graphql_response["data"]["repository"]["collaborators"]["edges"] + ) + except (TypeError, KeyError): + logging.debug("Repo %s does not seem to have any collaborators", single_repo_name) + + return repo_has_next_page, repo_cursor + + logging.warning("No relevant data found in GraphQL response") + logging.debug("GraphQL response: %s", graphql_response) + return False, "" + + def _populate_current_repos_collaborators(self) -> None: """Populate self.current_repos_collaborators with data from repo_collaborators""" for repo, collaborators in self.current_repos_collaborators.items(): - if repo.name in repo_collaborators: + if repo.name in self.graphql_repos_collaborators: # Extract each collaborator from the GraphQL response for this repo - for collaborator in repo_collaborators[repo.name]: + for collaborator in self.graphql_repos_collaborators[repo.name]: login: str = collaborator["node"]["login"] # Skip entry if collaborator is org owner, which is "admin" anyway if login.lower() in [user.login.lower() for user in self.current_org_owners]: From 83ea3a745dbde0261a6f4f114d52862c2bab9f02 Mon Sep 17 00:00:00 2001 From: Max Mehl Date: Tue, 3 Dec 2024 12:26:49 +0100 Subject: [PATCH 5/5] fix initialisation of graphql_repos_collaborators[repo_name] with first list of collaborators --- gh_org_mgr/_gh_org.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/gh_org_mgr/_gh_org.py b/gh_org_mgr/_gh_org.py index 2e9d078..f65f3e3 100644 --- a/gh_org_mgr/_gh_org.py +++ b/gh_org_mgr/_gh_org.py @@ -954,11 +954,11 @@ def _fetch_collaborators_of_all_organization_repos(self) -> None: graphql_query = """ query($owner: String!, $cursor: String) { organization(login: $owner) { - repositories(first: 5, after: $cursor) { + repositories(first: 100, after: $cursor) { edges { node { name - collaborators(first: 20) { + collaborators(first: 100) { edges { node { login @@ -1107,9 +1107,8 @@ def _extract_data_from_graphql_response( # fill in collaborators of repo try: - self.graphql_repos_collaborators[repo_name].extend( - repo_edges["node"]["collaborators"]["edges"] - ) + repo_collaborators = repo_edges["node"]["collaborators"]["edges"] + self.graphql_repos_collaborators[repo_name] = repo_collaborators except (TypeError, KeyError): logging.debug("Repo %s does not seem to have any collaborators", repo_name) @@ -1137,9 +1136,10 @@ def _extract_data_from_graphql_response( # fill in collaborators of repo try: - self.graphql_repos_collaborators[single_repo_name].extend( - graphql_response["data"]["repository"]["collaborators"]["edges"] - ) + repo_collaborators = graphql_response["data"]["repository"]["collaborators"][ + "edges" + ] + self.graphql_repos_collaborators[single_repo_name].extend(repo_collaborators) except (TypeError, KeyError): logging.debug("Repo %s does not seem to have any collaborators", single_repo_name)