diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d4a04a6b..3a25017e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -10,7 +10,7 @@ jobs: strategy: matrix: - python-version: [3.6, 3.7, 3.8, 3.9] + python-version: ['3.7', '3.8', '3.9', '3.10'] services: postgres: @@ -62,14 +62,6 @@ jobs: run: | python setup.py install --user - - name: Download and link credentialdigger's models - env: - path_model: https://github.com/SAP/credential-digger/releases/download/PM-v1.0.1/path_model-1.0.1.tar.gz - snippet_model: https://github.com/SAP/credential-digger/releases/download/SM-v1.0.0/snippet_model-1.0.0.tar.gz - run: | - python -m credentialdigger download path_model - python -m credentialdigger download snippet_model - - name: Run unit tests run: | pytest tests/unit_tests @@ -85,7 +77,5 @@ jobs: POSTGRES_DB: credential_digger_tests DBHOST: localhost DBPORT: 5432 - path_model: https://github.com/SAP/credential-digger/releases/download/PM-v1.0.1/path_model-1.0.1.tar.gz - snippet_model: https://github.com/SAP/credential-digger/releases/download/SM-v1.0.0/snippet_model-1.0.0.tar.gz run: | pytest tests/functional_tests diff --git a/credentialdigger/client.py b/credentialdigger/client.py index e107b282..7e6770f9 100644 --- a/credentialdigger/client.py +++ b/credentialdigger/client.py @@ -7,6 +7,7 @@ from datetime import datetime, timezone import yaml +from git import GitCommandError from github import Github from rich.progress import Progress @@ -982,11 +983,16 @@ def scan_user(self, username, category=None, models=None, debug=False, # Get repo clone url without .git at the end repo_url = repo.clone_url[:-4] logger.info(f'{i}/{repos_num}) Scanning {repo.url}') - missing_ids[repo_url] = self._scan(repo_url, scanner, - models=models, - debug=debug, - similarity=similarity, - git_token=git_token) + try: + missing_ids[repo_url] = self._scan(repo_url, scanner, + models=models, + debug=debug, + similarity=similarity, + git_token=git_token) + except GitCommandError: + logger.warning(f'{i}/{repos_num} Ignore {repo_url} ' + '(it can not be cloned)') + return missing_ids def scan_wiki(self, repo_url, category=None, models=None, debug=False, @@ -1366,7 +1372,8 @@ def update_similar_snippets(self, # Compute similarity of target_embedding and embedding similarity = compute_similarity(target_embedding, embedding) - if similarity > threshold: - self.update_discovery(d['id'], state) + # Increase counter if similar and the update is successful + if (similarity > threshold and + self.update_discovery(d['id'], state)): n_updated_snippets += 1 return n_updated_snippets diff --git a/credentialdigger/client_postgres.py b/credentialdigger/client_postgres.py index 3b83d9b7..d88ecd95 100644 --- a/credentialdigger/client_postgres.py +++ b/credentialdigger/client_postgres.py @@ -508,7 +508,7 @@ def update_repo(self, url, last_scan): bool `True` if the update is successful, `False` otherwise """ - super().update_repo( + return super().update_repo( url=url, last_scan=last_scan, query='UPDATE repos SET last_scan=%s WHERE url=%s RETURNING true' ) @@ -528,7 +528,7 @@ def update_discovery(self, discovery_id, new_state): bool `True` if the update is successful, `False` otherwise """ - super().update_discovery( + return super().update_discovery( discovery_id=discovery_id, new_state=new_state, query='UPDATE discoveries SET state=%s WHERE id=%s RETURNING true') @@ -548,7 +548,7 @@ def update_discoveries(self, discoveries_ids, new_state): bool `True` if the update is successful, `False` otherwise """ - super().update_discoveries( + return super().update_discoveries( discoveries_ids=discoveries_ids, new_state=new_state, query='UPDATE discoveries SET state=%s WHERE id IN %s RETURNING true') @@ -581,6 +581,6 @@ def update_discovery_group(self, new_state, repo_url, file_name, snippet=None): if snippet is not None: query += ' and snippet=%s' query += ' RETURNING true' - super().update_discovery_group( + return super().update_discovery_group( new_state=new_state, repo_url=repo_url, file_name=file_name, snippet=snippet, query=query) diff --git a/credentialdigger/client_sqlite.py b/credentialdigger/client_sqlite.py index 691da16c..4035c72a 100644 --- a/credentialdigger/client_sqlite.py +++ b/credentialdigger/client_sqlite.py @@ -67,7 +67,7 @@ def query_check(self, query, *args): try: cursor.execute(query, args) self.db.commit() - return cursor.rowcount < 1 + return cursor.rowcount >= 1 except (TypeError, IndexError): """ A TypeError is raised if any of the required arguments is missing. """ @@ -544,7 +544,7 @@ def update_repo(self, url, last_scan): bool `True` if the update is successful, `False` otherwise """ - super().update_repo( + return super().update_repo( url=url, last_scan=last_scan, query='UPDATE repos SET last_scan=? WHERE url=?' ) @@ -564,7 +564,7 @@ def update_discovery(self, discovery_id, new_state): bool `True` if the update is successful, `False` otherwise """ - super().update_discovery( + return super().update_discovery( new_state=new_state, discovery_id=discovery_id, query='UPDATE discoveries SET state=? WHERE id=?' ) @@ -584,7 +584,7 @@ def update_discoveries(self, discoveries_ids, new_state): bool `True` if the update is successful, `False` otherwise """ - super().update_discoveries( + return super().update_discoveries( discoveries_ids=discoveries_ids, new_state=new_state, query='UPDATE discoveries SET state=? WHERE id IN(' @@ -617,6 +617,6 @@ def update_discovery_group(self, new_state, repo_url, file_name, snippet=None): query += ' and file_name=?' if snippet is not None: query += ' and snippet=?' - super().update_discovery_group( + return super().update_discovery_group( new_state=new_state, repo_url=repo_url, file_name=file_name, snippet=snippet, query=query) diff --git a/credentialdigger/models/password_model.py b/credentialdigger/models/password_model.py index 89715316..d1325488 100644 --- a/credentialdigger/models/password_model.py +++ b/credentialdigger/models/password_model.py @@ -46,17 +46,18 @@ def analyze_batch(self, discoveries): # We have to classify only the "new" discoveries new_discoveries = [d for d in discoveries if d['state'] == 'new'] no_new_discoveries = [d for d in discoveries if d['state'] != 'new'] - # Create a dataset with all the preprocessed (new) snippets - data = self._pre_process([d['snippet'] for d in new_discoveries]) - # data = self._preprocess_batch_data(snippets) - # Compute a prediction for each snippet - outputs = self.model.predict(data) - logits = outputs['logits'] - predictions = tf.argmax(logits, 1) - # Check predictions and set FP discoveries accordingly - for d, p in zip(new_discoveries, predictions): - if p == 0: - d['state'] = 'false_positive' + # Process new_discoveries if not empty + if new_discoveries: + # Create a dataset with all the preprocessed (new) snippets + data = self._pre_process([d['snippet'] for d in new_discoveries]) + # Compute a prediction for each snippet + outputs = self.model.predict(data) + logits = outputs['logits'] + predictions = tf.argmax(logits, 1) + # Check predictions and set FP discoveries accordingly + for d, p in zip(new_discoveries, predictions): + if p == 0: + d['state'] = 'false_positive' return new_discoveries + no_new_discoveries def analyze(self, discovery): @@ -69,11 +70,9 @@ def analyze(self, discovery): Returns ------- - discoveries: list of dict - The discoveries, with states updated according to - the model's predictions - n_false_positives: int - The number of false positives detected by the model + bool + True if the snippet is safe (i.e., there is no leak). + False otherwise """ # Preprocess the snippet data = self._pre_process([discovery['snippet']]) @@ -84,6 +83,7 @@ def analyze(self, discovery): # The model classified this snippet as a false positive # (i.e., spam) return True + return False def _pre_process(self, snippet): """ Compute encodings of snippets and format them to a standard diff --git a/credentialdigger/models/path_model.py b/credentialdigger/models/path_model.py index 92b7309f..30dce462 100644 --- a/credentialdigger/models/path_model.py +++ b/credentialdigger/models/path_model.py @@ -21,7 +21,8 @@ def analyze(self, discovery): Returns ------- bool - True if the discovery is classified as false positive (i.e., spam) + True if the discovery is classified as false positive (i.e., spam), + False otherwise """ return bool(self.fp_keywords.search(discovery['file_name'].lower())) diff --git a/credentialdigger/scanners/file_scanner.py b/credentialdigger/scanners/file_scanner.py index 104c6aba..dacfa049 100644 --- a/credentialdigger/scanners/file_scanner.py +++ b/credentialdigger/scanners/file_scanner.py @@ -157,7 +157,7 @@ def scan_file(self, project_root, relative_path, **kwargs): for row in file_to_scan: rh = ResultHandler() self.stream.scan( - row if sys.version_info < (3, 9) else row.encode( + row if sys.version_info < (3, 8) else row.encode( 'utf-8'), match_event_handler=rh.handle_results, context=[row.strip(), relative_path, commit_id, @@ -169,6 +169,8 @@ def scan_file(self, project_root, relative_path, **kwargs): except UnicodeDecodeError: # Don't scan binary files pass + except FileNotFoundError: + logger.warning(f'Ignore {relative_path} (file not found)') return discoveries def _prune(self, rel_dir_root, dirs, files, max_depth=-1, ignore_list=[]): diff --git a/credentialdigger/scanners/git_scanner.py b/credentialdigger/scanners/git_scanner.py index d3a79e13..8ab299cd 100644 --- a/credentialdigger/scanners/git_scanner.py +++ b/credentialdigger/scanners/git_scanner.py @@ -100,6 +100,7 @@ def get_git_repo(self, repo_url, local_repo): GitRepo.clone_from(repo_url, project_path) repo = GitRepo(project_path) except GitCommandError as e: + logger.warning('Repo can not be cloned') shutil.rmtree(project_path) raise e @@ -405,7 +406,7 @@ def _regex_check(self, printable_diff, filename, commit_hash): rh = ResultHandler() self.stream.scan( - row if sys.version_info < (3, 9) else row.encode('utf-8'), + row if sys.version_info < (3, 8) else row.encode('utf-8'), match_event_handler=rh.handle_results, context=[row, filename, commit_hash, line_number]) if rh.result: diff --git a/requirements.txt b/requirements.txt index ed27a6ae..ea59d6b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,21 +1,21 @@ Flask flask_jwt_extended GitPython -hyperscan==0.2.0; python_version >= "3.9" -hyperscan==0.1.5; python_version < "3.9" +hyperscan==0.2.0; python_version >= "3.8" +hyperscan==0.1.5; python_version < "3.8" numpy pandas psycopg2-binary PyGithub python-dotenv pyyaml -rich +rich~=12.2 srsly>=2.4.0 -tensorflow==2.6.2; python_version >= "3.8" -tensorflow==2.4.*; python_version < "3.8" -tensorflow-estimator==2.6.0; python_version >= "3.8" -tensorflow-estimator==2.4.*; python_version < "3.8" -tensorflow-text==2.6.0; python_version >= "3.8" -tensorflow-text==2.4.*; python_version < "3.8" +tensorflow==2.8.0; python_version >= "3.8" +tensorflow~=2.4; python_version < "3.8" +tensorflow-estimator==2.8.0; python_version >= "3.8" +tensorflow-estimator~=2.4; python_version < "3.8" +tensorflow-text==2.8.1; python_version >= "3.8" +tensorflow-text~=2.4; python_version < "3.8" tf-models-official transformers diff --git a/setup.py b/setup.py index 9a0c1149..df501c9c 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ def requirements(): setuptools.setup( name='credentialdigger', - version='4.6.1', + version='4.7.0', author='SAP SE', maintainer='Marco Rosa, Slim Trabelsi', maintainer_email='marco.rosa@sap.com, slim.trabelsi@sap.com', diff --git a/tests/functional_tests/test_rules.yml b/tests/functional_tests/test_rules.yml index a6355c21..9891eac8 100644 --- a/tests/functional_tests/test_rules.yml +++ b/tests/functional_tests/test_rules.yml @@ -31,6 +31,6 @@ rules: category: token description: MailChimp API key - - regex: sshpass|password|pwd|passwd|pass + - regex: sshpass|password|pwd|passwd|pass[\W] category: password - description: password keywords \ No newline at end of file + description: password keywords diff --git a/tests/functional_tests/test_scans_postgres.py b/tests/functional_tests/test_scans_postgres.py index df451e60..4bc530c1 100644 --- a/tests/functional_tests/test_scans_postgres.py +++ b/tests/functional_tests/test_scans_postgres.py @@ -7,6 +7,8 @@ from git import Repo as GitRepo from psycopg2 import connect +TOTAL_PW_DISCOVERIES = 11 + class TestScansPostgres(unittest.TestCase): dotenv = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".env") @@ -35,7 +37,7 @@ def test_scan_github(self): cli.main(["", "scan", "--category", "password", "--dotenv", self.dotenv, "--force", self.repo_url]) - self.assertEqual(cm.exception.code, 9) + self.assertEqual(cm.exception.code, TOTAL_PW_DISCOVERIES) def test_scan_local(self): repo_path = tempfile.mkdtemp() @@ -46,9 +48,9 @@ def test_scan_local(self): "--models", "PathModel", "PasswordModel", "--category", "password", "--force", "--local", repo_path]) - # When using the models, we expect to be left with less than 9 - # discoveries to manually review - self.assertTrue(cm.exception.code < 9) + # When using the models, we expect to be left with less than + # TOTAL_PW_DISCOVERIES discoveries to manually review + self.assertTrue(cm.exception.code < TOTAL_PW_DISCOVERIES) shutil.rmtree(repo_path) diff --git a/tests/functional_tests/test_scans_sqlite.py b/tests/functional_tests/test_scans_sqlite.py index 52179ef9..34887e86 100644 --- a/tests/functional_tests/test_scans_sqlite.py +++ b/tests/functional_tests/test_scans_sqlite.py @@ -6,6 +6,8 @@ from credentialdigger.client_sqlite import SqliteClient from git import Repo as GitRepo +TOTAL_PW_DISCOVERIES = 11 + class TestScansSqlite(unittest.TestCase): repo_url = 'https://github.com/SAP/credential-digger-tests' @@ -27,7 +29,7 @@ def test_scan_github(self): cli.main(["", "scan", "--sqlite", self.db_path, "--category", "password", "--force", self.repo_url]) - self.assertEqual(cm.exception.code, 9) + self.assertEqual(cm.exception.code, TOTAL_PW_DISCOVERIES) def test_scan_local(self): repo_path = os.path.join(self.tmp_path, "tmp_repo") @@ -38,9 +40,9 @@ def test_scan_local(self): "--models", "PathModel", "PasswordModel", "--category", "password", "--force", "--local", repo_path]) - # When using the models, we expect to be left with less than 9 - # discoveries to manually review - self.assertTrue(cm.exception.code < 9) + # When using the models, we expect to be left with less than + # TOTAL_PW_DISCOVERIES discoveries to manually review + self.assertTrue(cm.exception.code < TOTAL_PW_DISCOVERIES) def test_scan_wiki(self): with self.assertRaises(SystemExit) as cm: