Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ And you will be asked to provide the following:
- A **Bitbucket** authentication token [**optional**], used for Bitbucket Cloud. Create an API token with scopes at `https://bitbucket.org/account/settings/api-tokens/` (permissions: `read:repository:bitbucket`, `read:account`). You will also need to provide your Atlassian account email, as Bitbucket API tokens use Basic authentication (`email:token` encoded in base64). Without a token you are limited to 60 requests/hour.
- The path to the trained classifiers (pickle files). If you have your own classifiers, you can provide them here. Otherwise, you can leave it blank.

- A download size limit in MB [**optional, default 200**]. SOMEF skips repository archives larger than this limit. Increase it if you need to process large repositories. You can also override it with the `--download-limit` parameter in the `describe` command.

If you want SOMEF to be automatically configured (without any tokens and using the default classifiers) just type:

Expand Down Expand Up @@ -365,6 +366,9 @@ Options:
from certain files like CODEOWNERS.
This may require extra API
requests and increase execution time
--download-limit INTEGER Download size limit in MB for repository
archives. Overrides the value set in the
configuration file.

-h, --help Show this message and exit.

Expand Down
26 changes: 26 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ Options:
where files are stored for analysis. Files
will be stored at the
desired path
--download-limit INTEGER Download size limit in MB for repository
archives. Overrides the value set in the
configuration file.

-all, --requirements_all Export all detected requirements, including
text and libraries (default).
Expand Down Expand Up @@ -155,4 +158,27 @@ To change it, edit your `~/.somef/config.json`:
Note: This parameter is different from the `-t` threshold used in `somef describe`,
which controls the confidence of the supervised classifiers.


### Download size limit

Controls the maximum size (in MB) of the repository archive that SOMEF will download.
Repositories whose archive is larger than this limit are skipped.

- **Default value**: `200`

To change it, run `somef configure` and enter the desired value when prompted,
or edit your `~/.somef/config.json`:

```json
{
"download_limit_mb": 500
}
```
You can also override it per command with the `--download-limit` option:


```bash
somef describe -r https://github.com/owner/repo --download-limit 1000 -o output.json
```

To see a live usage example, try our Binder Notebook: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/KnowledgeCaptureAndDiscovery/somef/HEAD?filepath=notebook%2FSOMEF%20Usage%20Example.ipynb)
11 changes: 10 additions & 1 deletion src/somef/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,11 @@ def configure(auto, base_uri):
installation = click.prompt("Installation classifier model file", default=configuration.default_installation)
citation = click.prompt("Citation classifier model file", default=configuration.default_citation)
base_uri = click.prompt("Base URI for RDF generation", default=base_uri)
download_limit = click.prompt("Download size limit in MB",
default=constants.SIZE_DOWNLOAD_LIMIT_MB, type=int)
# configuration.configure()
configuration.configure(github_authorization, gitlab_authorization, codeberg_authorization, bitbucket_authorization, bitbucket_email, description, invocation, installation, citation, base_uri)
configuration.configure(github_authorization, gitlab_authorization, codeberg_authorization, bitbucket_authorization, bitbucket_email, description, invocation, installation, citation, base_uri, download_limit_mb= download_limit)

click.secho(f"Success", fg="green")


Expand Down Expand Up @@ -238,6 +241,12 @@ def configure(auto, base_uri):
default=None,
help="Bitbucket Atlassian account email (required with --bitbucket-token)"
)
@click.option(
"--download-limit",
type=int,
default=None,
help="Download size limit in MB (overrides config file value)"
)

def describe(requirements_v, requirements_all, **kwargs):
# import so missing packages get installed when appropriate
Expand Down
10 changes: 7 additions & 3 deletions src/somef/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ def get_configuration_file():
file_paths = json.load(fh)
if constants.CONF_SIMILARITY_THRESHOLD not in file_paths:
file_paths[constants.CONF_SIMILARITY_THRESHOLD] = constants.CONF_DEFAULT_SIMILARITY_THRESHOLD
if constants.CONF_DOWNLOAD_LIMIT_MB not in file_paths:
file_paths[constants.CONF_DOWNLOAD_LIMIT_MB] = constants.SIZE_DOWNLOAD_LIMIT_MB
else:
sys.exit("Error: Please provide a config.json file or run somef configure.")
return file_paths
Expand Down Expand Up @@ -63,8 +65,9 @@ def configure(
installation=default_installation,
citation=default_citation,
base_uri=constants.CONF_DEFAULT_BASE_URI,
similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD):

similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD,
download_limit_mb=constants.SIZE_DOWNLOAD_LIMIT_MB):

""" Function to configure the main program"""
import nltk
nltk.download('wordnet')
Expand All @@ -89,7 +92,8 @@ def configure(
constants.CONF_INSTALLATION: installation,
constants.CONF_CITATION: citation,
constants.CONF_BASE_URI: base_uri,
constants.CONF_SIMILARITY_THRESHOLD: similarity_threshold
constants.CONF_SIMILARITY_THRESHOLD: similarity_threshold,
constants.CONF_DOWNLOAD_LIMIT_MB: download_limit_mb
}

# if data[constants.CONF_AUTHORIZATION] == "token ":
Expand Down
43 changes: 29 additions & 14 deletions src/somef/process_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,19 @@ def is_gitlab(gitlab_server):
return False

# the same as requests.get(args).json(), but protects against rate limiting
def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=constants.SIZE_DOWNLOAD_LIMIT_MB, **kwargs):
def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=None, **kwargs):
# def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, **kwargs):
"""Function to obtain how many requests we have pending with the repository API"""

"""GET request that handles rate limiting and prevents downloading excessively large files"""

if size_limit_mb is None:
try:
config = configuration.get_configuration_file()
size_limit_mb = config.get(constants.CONF_DOWNLOAD_LIMIT_MB, constants.SIZE_DOWNLOAD_LIMIT_MB)
except Exception:
size_limit_mb = constants.SIZE_DOWNLOAD_LIMIT_MB

size_limit_bytes = size_limit_mb * 1024 * 1024
url = args[0] if args else kwargs.get("url")
if not url:
Expand Down Expand Up @@ -428,7 +436,7 @@ def load_gitlab_repository_metadata(repo_metadata: Result, repository_url, autho



def download_gitlab_files(directory, owner, repo_name, repo_branch, repo_ref, authorization=None):
def download_gitlab_files(directory, owner, repo_name, repo_branch, repo_ref, authorization=None, download_limit=None):
"""
Download all repository files from a GitLab repository
Parameters
Expand Down Expand Up @@ -458,7 +466,10 @@ def download_gitlab_files(directory, owner, repo_name, repo_branch, repo_ref, au
)

logging.info(f"Downloading {repo_archive_url}")
repo_download = requests.get(repo_archive_url, headers=gitlab_header_template(authorization))
repo_download, _ = rate_limit_get(repo_archive_url, headers=gitlab_header_template(authorization), size_limit_mb=download_limit)
if repo_download is None:
logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB")
return None
repo_zip = repo_download.content

repo_zip_file = os.path.join(directory, "repo.zip")
Expand Down Expand Up @@ -812,7 +823,7 @@ def do_crosswalk(data, crosswalk_table):


def download_repository_files(owner, repo_name, default_branch, repo_type, target_dir, repo_ref=None,
authorization=None):
authorization=None, download_limit=None):
"""
Given a repository, this method will download its files and return the readme text
Parameters
Expand All @@ -832,19 +843,19 @@ def download_repository_files(owner, repo_name, default_branch, repo_type, targe
"""

if repo_type == constants.RepositoryType.GITHUB:
return download_github_files(target_dir, owner, repo_name, default_branch, authorization)
return download_github_files(target_dir, owner, repo_name, default_branch, authorization, download_limit)
elif repo_type == constants.RepositoryType.GITLAB:
return download_gitlab_files(target_dir, owner, repo_name, default_branch, repo_ref, authorization)
return download_gitlab_files(target_dir, owner, repo_name, default_branch, repo_ref, authorization,download_limit)
elif repo_type == constants.RepositoryType.CODEBERG:
return download_codeberg_files(target_dir, owner, repo_name, default_branch, authorization)
return download_codeberg_files(target_dir, owner, repo_name, default_branch, authorization, download_limit)
elif repo_type == constants.RepositoryType.BITBUCKET:
return download_bitbucket_files(target_dir, owner, repo_name, default_branch, authorization)
return download_bitbucket_files(target_dir, owner, repo_name, default_branch, authorization, download_limit)
else:
logging.error("Cannot download files from a local repository!")
return None


def download_github_files(directory, owner, repo_name, repo_ref, authorization):
def download_github_files(directory, owner, repo_name, repo_ref, authorization, download_limit=None):
"""
Download all repository files from a GitHub repository.

Expand Down Expand Up @@ -882,6 +893,7 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization):
# works for the vast majority of repos and avoids an extra HTTP round-trip. When
# that returns 300 (ambiguous ref) or 404 (ref not found), we escalate to the
# fully-qualified refs/heads/ and refs/tags/ forms before falling back to main.

candidate_urls = [
f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip",
f"https://github.com/{owner}/{repo_name}/archive/refs/heads/{repo_ref}.zip",
Expand All @@ -892,7 +904,7 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization):
repo_archive_url = None
for repo_archive_url in candidate_urls:
logging.info(f"Downloading {repo_archive_url}")
repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization),size_limit_mb=download_limit)
if repo_download is None:
# Size limit exceeded or streaming error — no point trying other URLs
logging.warning(
Expand Down Expand Up @@ -1250,7 +1262,7 @@ def load_codeberg_repository_metadata(repo_metadata: Result, repository_url, aut
return repo_metadata, owner, repo_name, default_branch, "/".join(path_components)


def download_codeberg_files(directory, owner, repo_name, repo_branch,authorization=None):
def download_codeberg_files(directory, owner, repo_name, repo_branch, authorization=None, download_limit=None):
"""
Download all repository files from a Codeberg repository.
"""
Expand All @@ -1259,7 +1271,10 @@ def download_codeberg_files(directory, owner, repo_name, repo_branch,authorizati

headers = codeberg_header_template(authorization)

repo_download, _ = rate_limit_get(repo_archive_url, headers=headers)
repo_download, _ = rate_limit_get(repo_archive_url, headers=headers, size_limit_mb=download_limit)
if repo_download is None:
logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or no content-length")
return None
if repo_download.status_code != 200:
logging.error(f"Error downloading Codeberg archive: HTTP {repo_download.status_code}")
return None
Expand Down Expand Up @@ -1419,12 +1434,12 @@ def load_bitbucket_repository_metadata(repo_metadata: Result, repository_url, au
return repo_metadata, owner, repo_name, default_branch, "/".join(path_components)


def download_bitbucket_files(directory, owner, repo_name, repo_branch, authorization=None):
def download_bitbucket_files(directory, owner, repo_name, repo_branch, authorization=None,download_limit=None):
repo_archive_url = f"https://bitbucket.org/{owner}/{repo_name}/get/{repo_branch}.zip"
logging.info(f"Downloading {repo_archive_url}")

headers = bitbucket_header_template(authorization)
repo_download, _ = rate_limit_get(repo_archive_url, headers=headers)
repo_download, _ = rate_limit_get(repo_archive_url, headers=headers, size_limit_mb=download_limit)
if repo_download is None:
logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or no content-length")
return None
Expand Down
20 changes: 11 additions & 9 deletions src/somef/somef_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, local_repo=None,
ignore_github_metadata=False, readme_only=False, keep_tmp=None, authorization=None,
ignore_test_folder=True,requirements_mode='all', reconcile_authors=False, branch=None, tag=None) -> Result:
ignore_test_folder=True,requirements_mode='all', reconcile_authors=False, branch=None, tag=None, download_limit= None) -> Result:
"""
Main function to get the data through the command line
Parameters
Expand Down Expand Up @@ -121,7 +121,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
elif keep_tmp is not None: # save downloaded files locally
os.makedirs(keep_tmp, exist_ok=True)
local_folder = process_repository.download_repository_files(owner, repo_name, def_branch, repo_type,
keep_tmp, repo_url, authorization)
keep_tmp, repo_url, authorization, download_limit)
if local_folder is not None:
readme_text, full_repository_metadata = process_files.process_repository_files(local_folder,
repository_metadata,
Expand All @@ -139,7 +139,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc

with tempfile.TemporaryDirectory() as temp_dir:
local_folder = process_repository.download_repository_files(owner, repo_name, def_branch, repo_type,
temp_dir, repo_url, authorization)
temp_dir, repo_url, authorization, download_limit)
if local_folder is not None:
readme_text, full_repository_metadata = process_files.process_repository_files(local_folder,
repository_metadata,
Expand Down Expand Up @@ -282,7 +282,8 @@ def run_cli(*,
gitlab_token=None,
codeberg_token=None,
bitbucket_token=None,
bitbucket_email=None
bitbucket_email=None,
download_limit=None
):
"""Function to run all the required components of the cli for a repository"""
# check if it is a valid url
Expand Down Expand Up @@ -318,8 +319,9 @@ def run_cli(*,
encoded_url = encoded_url.replace(".","") #removing dots just in case
repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, repo_url=repo_url,
ignore_github_metadata=ignore_github_metadata, readme_only=readme_only,
keep_tmp=keep_tmp, authorization=authorization, ignore_test_folder=ignore_test_folder, requirements_mode=requirements_mode, reconcile_authors=reconcile_authors,
branch=branch, tag=tag)
keep_tmp=keep_tmp, authorization=authorization, ignore_test_folder=ignore_test_folder,
requirements_mode=requirements_mode, reconcile_authors=reconcile_authors,
branch=branch, tag=tag, download_limit=download_limit)

if hasattr(repo_data, "get_json"):
repo_data = repo_data.get_json()
Expand Down Expand Up @@ -355,15 +357,15 @@ def run_cli(*,
repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, repo_url=repo_url,
ignore_github_metadata=ignore_github_metadata, readme_only=readme_only,
keep_tmp=keep_tmp, authorization=authorization, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors,
branch=branch, tag=tag)
branch=branch, tag=tag, download_limit=download_limit)
elif local_repo:
repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers,
local_repo=local_repo, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors,
branch=branch, tag=tag)
branch=branch, tag=tag, download_limit=download_limit)
else:
repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers,
doc_src=doc_src, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors,
branch=branch, tag=tag)
branch=branch, tag=tag, download_limit= download_limit)

if hasattr(repo_data, "get_json"):
repo_data = repo_data.get_json()
Expand Down
Loading
Loading