diff --git a/README.md b/README.md index 0d85aa92..8e7f625c 100644 --- a/README.md +++ b/README.md @@ -274,6 +274,7 @@ And you will be asked to provide the following: - A **Bitbucket** authentication token [**optional**], used for Bitbucket Cloud. Create an API token with scopes at `https://bitbucket.org/account/settings/api-tokens/` (permissions: `read:repository:bitbucket`, `read:account`). You will also need to provide your Atlassian account email, as Bitbucket API tokens use Basic authentication (`email:token` encoded in base64). Without a token you are limited to 60 requests/hour. - The path to the trained classifiers (pickle files). If you have your own classifiers, you can provide them here. Otherwise, you can leave it blank. +- A download size limit in MB [**optional, default 200**]. SOMEF skips repository archives larger than this limit. Increase it if you need to process large repositories. You can also override it with the `--download-limit` parameter in the `describe` command. If you want SOMEF to be automatically configured (without any tokens and using the default classifiers) just type: @@ -365,6 +366,9 @@ Options: from certain files like CODEOWNERS. This may require extra API requests and increase execution time + --download-limit INTEGER Download size limit in MB for repository + archives. Overrides the value set in the + configuration file. -h, --help Show this message and exit. diff --git a/docs/usage.md b/docs/usage.md index c5b06245..fa68fbbc 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -64,6 +64,9 @@ Options: where files are stored for analysis. Files will be stored at the desired path + --download-limit INTEGER Download size limit in MB for repository + archives. Overrides the value set in the + configuration file. -all, --requirements_all Export all detected requirements, including text and libraries (default). 
@@ -155,4 +158,27 @@ To change it, edit your `~/.somef/config.json`: Note: This parameter is different from the `-t` threshold used in `somef describe`, which controls the confidence of the supervised classifiers. + +### Download size limit + +Controls the maximum size (in MB) of repository archives that SOMEF will download. +Repositories larger than this limit are skipped. + +- **Default value**: `200` + +To change it, run `somef configure` and enter the desired value when prompted, +or edit your `~/.somef/config.json`: + +```json +{ + "download_limit_mb": 500 +} +``` +You can also override it per command with the `--download-limit` option: + + +```bash +somef describe -r https://github.com/owner/repo --download-limit 1000 -o output.json +``` + To see a live usage example, try our Binder Notebook: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/KnowledgeCaptureAndDiscovery/somef/HEAD?filepath=notebook%2FSOMEF%20Usage%20Example.ipynb) \ No newline at end of file diff --git a/src/somef/__main__.py b/src/somef/__main__.py index b62c2c51..3cbe8681 100644 --- a/src/somef/__main__.py +++ b/src/somef/__main__.py @@ -48,8 +48,11 @@ def configure(auto, base_uri): installation = click.prompt("Installation classifier model file", default=configuration.default_installation) citation = click.prompt("Citation classifier model file", default=configuration.default_citation) base_uri = click.prompt("Base URI for RDF generation", default=base_uri) + download_limit = click.prompt("Download size limit in MB", + default=constants.SIZE_DOWNLOAD_LIMIT_MB, type=int) # configuration.configure() - configuration.configure(github_authorization, gitlab_authorization, codeberg_authorization, bitbucket_authorization, bitbucket_email, description, invocation, installation, citation, base_uri) + configuration.configure(github_authorization, gitlab_authorization, codeberg_authorization, bitbucket_authorization, bitbucket_email, description, invocation, installation, 
citation, base_uri, download_limit_mb= download_limit) + click.secho(f"Success", fg="green") @@ -238,6 +241,12 @@ def configure(auto, base_uri): default=None, help="Bitbucket Atlassian account email (required with --bitbucket-token)" ) +@click.option( + "--download-limit", + type=int, + default=None, + help="Download size limit in MB (overrides config file value)" +) def describe(requirements_v, requirements_all, **kwargs): # import so missing packages get installed when appropriate diff --git a/src/somef/configuration.py b/src/somef/configuration.py index ab71f99c..f7083b4f 100644 --- a/src/somef/configuration.py +++ b/src/somef/configuration.py @@ -29,6 +29,8 @@ def get_configuration_file(): file_paths = json.load(fh) if constants.CONF_SIMILARITY_THRESHOLD not in file_paths: file_paths[constants.CONF_SIMILARITY_THRESHOLD] = constants.CONF_DEFAULT_SIMILARITY_THRESHOLD + if constants.CONF_DOWNLOAD_LIMIT_MB not in file_paths: + file_paths[constants.CONF_DOWNLOAD_LIMIT_MB] = constants.SIZE_DOWNLOAD_LIMIT_MB else: sys.exit("Error: Please provide a config.json file or run somef configure.") return file_paths @@ -63,8 +65,9 @@ def configure( installation=default_installation, citation=default_citation, base_uri=constants.CONF_DEFAULT_BASE_URI, - similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD): - + similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD, + download_limit_mb=constants.SIZE_DOWNLOAD_LIMIT_MB): + """ Function to configure the main program""" import nltk nltk.download('wordnet') @@ -89,7 +92,8 @@ def configure( constants.CONF_INSTALLATION: installation, constants.CONF_CITATION: citation, constants.CONF_BASE_URI: base_uri, - constants.CONF_SIMILARITY_THRESHOLD: similarity_threshold + constants.CONF_SIMILARITY_THRESHOLD: similarity_threshold, + constants.CONF_DOWNLOAD_LIMIT_MB: download_limit_mb } # if data[constants.CONF_AUTHORIZATION] == "token ": diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index 
7adcd4e1..d77347f6 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -39,11 +39,19 @@ def is_gitlab(gitlab_server): return False # the same as requests.get(args).json(), but protects against rate limiting -def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=constants.SIZE_DOWNLOAD_LIMIT_MB, **kwargs): +def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=None, **kwargs): # def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, **kwargs): """Function to obtain how many requests we have pending with the repository API""" """GET request that handles rate limiting and prevents downloading excessively large files""" + + if size_limit_mb is None: + try: + config = configuration.get_configuration_file() + size_limit_mb = config.get(constants.CONF_DOWNLOAD_LIMIT_MB, constants.SIZE_DOWNLOAD_LIMIT_MB) + except Exception: + size_limit_mb = constants.SIZE_DOWNLOAD_LIMIT_MB + size_limit_bytes = size_limit_mb * 1024 * 1024 url = args[0] if args else kwargs.get("url") if not url: @@ -428,7 +436,7 @@ def load_gitlab_repository_metadata(repo_metadata: Result, repository_url, autho -def download_gitlab_files(directory, owner, repo_name, repo_branch, repo_ref, authorization=None): +def download_gitlab_files(directory, owner, repo_name, repo_branch, repo_ref, authorization=None, download_limit=None): """ Download all repository files from a GitLab repository Parameters @@ -458,7 +466,10 @@ def download_gitlab_files(directory, owner, repo_name, repo_branch, repo_ref, au ) logging.info(f"Downloading {repo_archive_url}") - repo_download = requests.get(repo_archive_url, headers=gitlab_header_template(authorization)) + repo_download, _ = rate_limit_get(repo_archive_url, headers=gitlab_header_template(authorization), size_limit_mb=download_limit) + if repo_download is None: + logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB") + return None repo_zip = 
repo_download.content repo_zip_file = os.path.join(directory, "repo.zip") @@ -812,7 +823,7 @@ def do_crosswalk(data, crosswalk_table): def download_repository_files(owner, repo_name, default_branch, repo_type, target_dir, repo_ref=None, - authorization=None): + authorization=None, download_limit=None): """ Given a repository, this method will download its files and return the readme text Parameters @@ -832,19 +843,19 @@ def download_repository_files(owner, repo_name, default_branch, repo_type, targe """ if repo_type == constants.RepositoryType.GITHUB: - return download_github_files(target_dir, owner, repo_name, default_branch, authorization) + return download_github_files(target_dir, owner, repo_name, default_branch, authorization, download_limit) elif repo_type == constants.RepositoryType.GITLAB: - return download_gitlab_files(target_dir, owner, repo_name, default_branch, repo_ref, authorization) + return download_gitlab_files(target_dir, owner, repo_name, default_branch, repo_ref, authorization,download_limit) elif repo_type == constants.RepositoryType.CODEBERG: - return download_codeberg_files(target_dir, owner, repo_name, default_branch, authorization) + return download_codeberg_files(target_dir, owner, repo_name, default_branch, authorization, download_limit) elif repo_type == constants.RepositoryType.BITBUCKET: - return download_bitbucket_files(target_dir, owner, repo_name, default_branch, authorization) + return download_bitbucket_files(target_dir, owner, repo_name, default_branch, authorization, download_limit) else: logging.error("Cannot download files from a local repository!") return None -def download_github_files(directory, owner, repo_name, repo_ref, authorization): +def download_github_files(directory, owner, repo_name, repo_ref, authorization, download_limit=None): """ Download all repository files from a GitHub repository. 
@@ -882,6 +893,7 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization): # works for the vast majority of repos and avoids an extra HTTP round-trip. When # that returns 300 (ambiguous ref) or 404 (ref not found), we escalate to the # fully-qualified refs/heads/ and refs/tags/ forms before falling back to main. + candidate_urls = [ f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip", f"https://github.com/{owner}/{repo_name}/archive/refs/heads/{repo_ref}.zip", @@ -892,7 +904,7 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization): repo_archive_url = None for repo_archive_url in candidate_urls: logging.info(f"Downloading {repo_archive_url}") - repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization)) + repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization),size_limit_mb=download_limit) if repo_download is None: # Size limit exceeded or streaming error — no point trying other URLs logging.warning( @@ -1250,7 +1262,7 @@ def load_codeberg_repository_metadata(repo_metadata: Result, repository_url, aut return repo_metadata, owner, repo_name, default_branch, "/".join(path_components) -def download_codeberg_files(directory, owner, repo_name, repo_branch,authorization=None): +def download_codeberg_files(directory, owner, repo_name, repo_branch, authorization=None, download_limit=None): """ Download all repository files from a Codeberg repository. 
""" @@ -1259,7 +1271,10 @@ def download_codeberg_files(directory, owner, repo_name, repo_branch,authorizati headers = codeberg_header_template(authorization) - repo_download, _ = rate_limit_get(repo_archive_url, headers=headers) + repo_download, _ = rate_limit_get(repo_archive_url, headers=headers, size_limit_mb=download_limit) + if repo_download is None: + logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or no content-length") + return None if repo_download.status_code != 200: logging.error(f"Error downloading Codeberg archive: HTTP {repo_download.status_code}") return None @@ -1419,12 +1434,12 @@ def load_bitbucket_repository_metadata(repo_metadata: Result, repository_url, au return repo_metadata, owner, repo_name, default_branch, "/".join(path_components) -def download_bitbucket_files(directory, owner, repo_name, repo_branch, authorization=None): +def download_bitbucket_files(directory, owner, repo_name, repo_branch, authorization=None,download_limit=None): repo_archive_url = f"https://bitbucket.org/{owner}/{repo_name}/get/{repo_branch}.zip" logging.info(f"Downloading {repo_archive_url}") headers = bitbucket_header_template(authorization) - repo_download, _ = rate_limit_get(repo_archive_url, headers=headers) + repo_download, _ = rate_limit_get(repo_archive_url, headers=headers, size_limit_mb=download_limit) if repo_download is None: logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or no content-length") return None diff --git a/src/somef/somef_cli.py b/src/somef/somef_cli.py index 7daadc0c..9ba9b2b2 100644 --- a/src/somef/somef_cli.py +++ b/src/somef/somef_cli.py @@ -23,7 +23,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, local_repo=None, ignore_github_metadata=False, readme_only=False, keep_tmp=None, authorization=None, - ignore_test_folder=True,requirements_mode='all', reconcile_authors=False, branch=None, tag=None) -> 
Result: + ignore_test_folder=True,requirements_mode='all', reconcile_authors=False, branch=None, tag=None, download_limit= None) -> Result: """ Main function to get the data through the command line Parameters @@ -121,7 +121,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc elif keep_tmp is not None: # save downloaded files locally os.makedirs(keep_tmp, exist_ok=True) local_folder = process_repository.download_repository_files(owner, repo_name, def_branch, repo_type, - keep_tmp, repo_url, authorization) + keep_tmp, repo_url, authorization, download_limit) if local_folder is not None: readme_text, full_repository_metadata = process_files.process_repository_files(local_folder, repository_metadata, @@ -139,7 +139,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc with tempfile.TemporaryDirectory() as temp_dir: local_folder = process_repository.download_repository_files(owner, repo_name, def_branch, repo_type, - temp_dir, repo_url, authorization) + temp_dir, repo_url, authorization, download_limit) if local_folder is not None: readme_text, full_repository_metadata = process_files.process_repository_files(local_folder, repository_metadata, @@ -282,7 +282,8 @@ def run_cli(*, gitlab_token=None, codeberg_token=None, bitbucket_token=None, - bitbucket_email=None + bitbucket_email=None, + download_limit=None ): """Function to run all the required components of the cli for a repository""" # check if it is a valid url @@ -318,8 +319,9 @@ def run_cli(*, encoded_url = encoded_url.replace(".","") #removing dots just in case repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, repo_url=repo_url, ignore_github_metadata=ignore_github_metadata, readme_only=readme_only, - keep_tmp=keep_tmp, authorization=authorization, ignore_test_folder=ignore_test_folder, requirements_mode=requirements_mode, reconcile_authors=reconcile_authors, - branch=branch, tag=tag) + keep_tmp=keep_tmp, 
authorization=authorization, ignore_test_folder=ignore_test_folder, + requirements_mode=requirements_mode, reconcile_authors=reconcile_authors, + branch=branch, tag=tag, download_limit=download_limit) if hasattr(repo_data, "get_json"): repo_data = repo_data.get_json() @@ -355,15 +357,15 @@ def run_cli(*, repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, repo_url=repo_url, ignore_github_metadata=ignore_github_metadata, readme_only=readme_only, keep_tmp=keep_tmp, authorization=authorization, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors, - branch=branch, tag=tag) + branch=branch, tag=tag, download_limit=download_limit) elif local_repo: repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, local_repo=local_repo, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors, - branch=branch, tag=tag) + branch=branch, tag=tag, download_limit=download_limit) else: repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, doc_src=doc_src, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors, - branch=branch, tag=tag) + branch=branch, tag=tag, download_limit= download_limit) if hasattr(repo_data, "get_json"): repo_data = repo_data.get_json() diff --git a/src/somef/test/test_process_repository.py b/src/somef/test/test_process_repository.py index 7a4e366b..ca370d16 100644 --- a/src/somef/test/test_process_repository.py +++ b/src/somef/test/test_process_repository.py @@ -5,9 +5,9 @@ import json import zipfile from pathlib import Path -from unittest.mock import MagicMock, patch, call +from unittest.mock import MagicMock, patch, call, mock_open from ..parser import pom_xml_parser -from .. import process_repository, process_files, somef_cli +from .. 
import process_repository, process_files, somef_cli, configuration from ..utils import constants from ..process_results import Result @@ -380,7 +380,7 @@ def test_http_300_falls_back_to_refs_heads(self, mock_rlg): (_make_mock_response(200, zip_bytes), ""), # refs/heads/v2.0.zip → 200 ] with tempfile.TemporaryDirectory() as tmp: - result = process_repository.download_github_files(tmp, "balaje", "icefem", "v2.0", None) + result = process_repository.download_github_files(tmp, "balaje", "icefem", "v2.0", None, None) self.assertIsNotNone(result, "Should succeed via refs/heads/ fallback") urls_tried = [c[0][0] for c in mock_rlg.call_args_list] @@ -399,7 +399,7 @@ def test_http_300_falls_back_to_refs_tags(self, mock_rlg): (_make_mock_response(200, zip_bytes), ""), # refs/tags/ → 200 ] with tempfile.TemporaryDirectory() as tmp: - result = process_repository.download_github_files(tmp, "owner", "repo", "v1.0", None) + result = process_repository.download_github_files(tmp, "owner", "repo", "v1.0", None, None) self.assertIsNotNone(result, "Should succeed via refs/tags/ fallback") urls_tried = [c[0][0] for c in mock_rlg.call_args_list] @@ -418,7 +418,7 @@ def test_http_404_falls_back_to_main(self, mock_rlg): (_make_mock_response(200, zip_bytes), ""), # main.zip → 200 ] with tempfile.TemporaryDirectory() as tmp: - result = process_repository.download_github_files(tmp, "owner", "repo", "oldmaster", None) + result = process_repository.download_github_files(tmp, "owner", "repo", "oldmaster", None, None) self.assertIsNotNone(result, "Should succeed via main.zip fallback") urls_tried = [c[0][0] for c in mock_rlg.call_args_list] @@ -432,7 +432,7 @@ def test_all_fallbacks_fail_returns_none_not_exit(self, mock_rlg): """ mock_rlg.return_value = (_make_mock_response(404), "") with tempfile.TemporaryDirectory() as tmp: - result = process_repository.download_github_files(tmp, "owner", "repo", "branch", None) + result = process_repository.download_github_files(tmp, "owner", "repo", "branch", 
None, None) self.assertIsNone(result) # All four candidates should have been attempted @@ -447,12 +447,14 @@ def test_size_limit_stops_loop_immediately(self, mock_rlg): """ mock_rlg.return_value = (None, None) with tempfile.TemporaryDirectory() as tmp: - result = process_repository.download_github_files(tmp, "owner", "repo", "main", None) + result = process_repository.download_github_files(tmp, "owner", "repo", "main", None, None) self.assertIsNone(result) self.assertEqual(mock_rlg.call_count, 1, "Should stop after first None response") + + class TestRateLimitGetHeadRequest(unittest.TestCase): """ Tests for the socket-leak fix in rate_limit_get (issue #909 follow-up). @@ -489,11 +491,13 @@ def test_head_used_instead_of_streaming_get(self, mock_head, mock_get): @patch("somef.process_repository.requests.get") @patch("somef.process_repository.requests.head") - def test_head_response_closed_on_size_exceeded(self, mock_head, mock_get): + @patch("somef.process_repository.configuration.get_configuration_file") + def test_head_response_closed_on_size_exceeded(self, mock_config, mock_head, mock_get): """ The HEAD response must be closed even when the size check triggers an early return — otherwise the connection stays open in the pool indefinitely. 
""" + mock_config.return_value = {constants.CONF_DOWNLOAD_LIMIT_MB: constants.SIZE_DOWNLOAD_LIMIT_MB} oversized = (constants.SIZE_DOWNLOAD_LIMIT_MB + 1) * 1024 * 1024 head_resp = MagicMock() head_resp.headers = {"Content-Length": str(oversized)} @@ -506,4 +510,70 @@ def test_head_response_closed_on_size_exceeded(self, mock_head, mock_get): self.assertIsNone(result) head_resp.close.assert_called_once() - mock_get.assert_not_called() # full GET should never be issued \ No newline at end of file + mock_get.assert_not_called() # full GET should never be issued + + + @patch("somef.process_repository.requests.get") + @patch("somef.process_repository.requests.head") + @patch("somef.process_repository.configuration.get_configuration_file") + def test_rate_limit_get_reads_limit_from_config(self, mock_config, mock_head, mock_get): + """When size_limit_mb=None, rate_limit_get must read from config file.""" + mock_config.return_value = {"download_limit_mb": 500} + head_resp = MagicMock() + head_resp.headers = {"Content-Length": str(300 * 1024 * 1024)} # 300 MB + head_resp.close = MagicMock() + mock_head.return_value = head_resp + + result, _ = process_repository.rate_limit_get( + "https://github.com/owner/repo/archive/main.zip" + ) + + # 300 MB > 200 should warning but we have set 500 + mock_get.assert_called_once() + + + @patch("somef.configuration.json.dump") + @patch("somef.configuration.Path") + @patch("somef.configuration.os.makedirs") + @patch("nltk.download") + def test_configure_saves_download_limit(self, mock_nltk, mock_makedirs, mock_path_cls, mock_json_dump): + """ + Verifies that configure(download_limit_mb=X) persists the value to the configuration file. + Mocks json.dump at the configuration module level and checks that the written JSON + contains {"download_limit_mb": 500}. 
+ """ + configuration.configure(download_limit_mb=500) + args, _ = mock_json_dump.call_args + assert args[0]["download_limit_mb"] == 500 + + + @patch("somef.configuration.json.load", return_value={"download_limit_mb": 500}) + @patch("somef.configuration.Path") + def test_get_configuration_file_returns_download_limit(self, mock_path_cls, mock_json_load): + """ + Verifies that get_configuration_file() reads the download_limit_mb key from the config file. + Mocks Path and json.load to return a config with download_limit_mb=500, + and asserts the returned dict contains that value. + """ + instance = mock_path_cls.return_value + instance.expanduser.return_value = instance + instance.exists.return_value = True + + config = configuration.get_configuration_file() + assert config["download_limit_mb"] == 500 + + + @patch("somef.process_repository.download_github_files") + def test_download_repository_files_propagates_limit(self, mock_dl): + """ + Verifies that download_limit is propagated from download_repository_files + to download_github_files. Mocks download_github_files, calls + download_repository_files with download_limit=500, and asserts that + download_github_files receives 500 as its last argument. + """ + with tempfile.TemporaryDirectory() as tmp: + process_repository.download_repository_files( + "owner", "repo", "main", constants.RepositoryType.GITHUB, + tmp, None, None, 500 + ) + mock_dl.assert_called_once_with(tmp, "owner", "repo", "main", None, 500) diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index 1b73f1dd..4d47eeb1 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -18,6 +18,7 @@ CONF_CITATION = "citation" CONF_BASE_URI = "base_uri" CONF_DEFAULT_BASE_URI = "https://w3id.org/okn/i/" +CONF_DOWNLOAD_LIMIT_MB = "download_limit_mb" __DEFAULT_SOMEF_CONFIGURATION_FILE__ = "~/.somef/config.json"