diff --git a/src/utils/nativeScan.ts b/src/utils/nativeScan.ts index bc47934..8840555 100644 --- a/src/utils/nativeScan.ts +++ b/src/utils/nativeScan.ts @@ -67,238 +67,6 @@ const SDK_CONFIG_FILE = 'sdk_config.json'; const DATA_ACCESS_METHODS = new Set(['read_dlo', 'read_dmo', 'write_to_dlo', 'write_to_dmo']); -/** - * Python stdlib module names to exclude from requirements.txt. - * - * Mirrors `sys.stdlib_module_names` from the Python 3.11 runtimes the SDK supports. - * Using the stdlib names of this version; if a package becomes stdlib in a - * later version we still want to drop it (it's not on PyPI and shipping it as a requirement - * would break installs on the runtime). - */ -const PYTHON_STDLIB_MODULES = new Set([ - '__future__', - '_ast', - '_thread', - 'abc', - 'aifc', - 'antigravity', - 'argparse', - 'array', - 'ast', - 'asynchat', - 'asyncio', - 'asyncore', - 'atexit', - 'audioop', - 'base64', - 'bdb', - 'binascii', - 'binhex', - 'bisect', - 'builtins', - 'bz2', - 'cProfile', - 'calendar', - 'cgi', - 'cgitb', - 'chunk', - 'cmath', - 'cmd', - 'code', - 'codecs', - 'codeop', - 'collections', - 'colorsys', - 'compileall', - 'concurrent', - 'configparser', - 'contextlib', - 'contextvars', - 'copy', - 'copyreg', - 'crypt', - 'csv', - 'ctypes', - 'curses', - 'dataclasses', - 'datetime', - 'dbm', - 'decimal', - 'difflib', - 'dis', - 'distutils', - 'doctest', - 'email', - 'encodings', - 'ensurepip', - 'enum', - 'errno', - 'faulthandler', - 'fcntl', - 'filecmp', - 'fileinput', - 'fnmatch', - 'fractions', - 'ftplib', - 'functools', - 'gc', - 'genericpath', - 'getopt', - 'getpass', - 'gettext', - 'glob', - 'graphlib', - 'grp', - 'gzip', - 'hashlib', - 'heapq', - 'hmac', - 'html', - 'http', - 'idlelib', - 'imaplib', - 'imghdr', - 'imp', - 'importlib', - 'inspect', - 'io', - 'ipaddress', - 'itertools', - 'json', - 'keyword', - 'lib2to3', - 'linecache', - 'locale', - 'logging', - 'lzma', - 'mailbox', - 'mailcap', - 'marshal', - 'math', - 'mimetypes', - 'mmap', - 
'modulefinder', - 'msilib', - 'msvcrt', - 'multiprocessing', - 'netrc', - 'nis', - 'nntplib', - 'nt', - 'ntpath', - 'nturl2path', - 'numbers', - 'opcode', - 'operator', - 'optparse', - 'os', - 'ossaudiodev', - 'pathlib', - 'pdb', - 'pickle', - 'pickletools', - 'pipes', - 'pkgutil', - 'platform', - 'plistlib', - 'poplib', - 'posix', - 'posixpath', - 'pprint', - 'profile', - 'pstats', - 'pty', - 'pwd', - 'py_compile', - 'pyclbr', - 'pydoc', - 'pydoc_data', - 'pyexpat', - 'queue', - 'quopri', - 'random', - 're', - 'readline', - 'reprlib', - 'resource', - 'rlcompleter', - 'runpy', - 'sched', - 'secrets', - 'select', - 'selectors', - 'shelve', - 'shlex', - 'shutil', - 'signal', - 'site', - 'smtpd', - 'smtplib', - 'sndhdr', - 'socket', - 'socketserver', - 'spwd', - 'sqlite3', - 'sre_compile', - 'sre_constants', - 'sre_parse', - 'ssl', - 'stat', - 'statistics', - 'string', - 'stringprep', - 'struct', - 'subprocess', - 'sunau', - 'symtable', - 'sys', - 'sysconfig', - 'syslog', - 'tabnanny', - 'tarfile', - 'telnetlib', - 'tempfile', - 'termios', - 'textwrap', - 'this', - 'threading', - 'time', - 'timeit', - 'tkinter', - 'token', - 'tokenize', - 'tomllib', - 'trace', - 'traceback', - 'tracemalloc', - 'tty', - 'turtle', - 'turtledemo', - 'types', - 'typing', - 'unicodedata', - 'unittest', - 'urllib', - 'uu', - 'uuid', - 'venv', - 'warnings', - 'wave', - 'weakref', - 'webbrowser', - 'winreg', - 'winsound', - 'wsgiref', - 'xdrlib', - 'xml', - 'xmlrpc', - 'zipapp', - 'zipfile', - 'zipimport', - 'zlib', - 'zoneinfo', -]); - const EXCLUDED_PACKAGES = new Set(['datacustomcode', 'pyspark']); /** Mirror of `datacustomcode/scan.py:get_sdk_config_path`. */ @@ -552,52 +320,40 @@ function validateAccessLayer(calls: DataAccessLayerCalls): void { /** * Mirror of `datacustomcode/scan.py:ImportVisitor.scan_file_for_imports`. 
* - Extracts top-level import names from `import X[, Y]` and `from X import …` statements, - * dropping stdlib modules, packages starting with `_`, and the SDK's own packages. - * Skips relative imports (`from . import x`) since they aren't PyPI dependencies. + * Runs `pipreqs --print --mode no-pin` on the directory that contains `filePath` and returns + * the package names it prints, trimmed and lowercased, minus `EXCLUDED_PACKAGES`. + * - The whole directory is scanned, not only `filePath` + * - Import extraction and stdlib/local-module filtering are left to pipreqs + * - Throws `SfError` (`PipreqsScanError`) carrying pipreqs' stderr/stdout, or the spawn error message when both are blank */ export async function scanFileForImports(filePath: string): Promise<Set<string>> { - const code = await fs.readFile(filePath, 'utf8'); - const imports = new Set<string>(); + const fileDir = path.dirname(filePath); - // Strip docstrings/triple-quoted blocks and `#`-style comments to avoid false positives. - const stripped = code - .replace(/"""[\s\S]*?"""/g, '') - .replace(/'''[\s\S]*?'''/g, '') - .replace(/(^|\s)#.*$/gm, '$1'); + // scan the directory containing the entrypoint.py file + const { execFile } = await import('node:child_process'); + const { promisify } = await import('node:util'); + const execFileAsync = promisify(execFile); - // `import a, b.c as alias` - const importRegex = /^[ \t]*import[ \t]+([^\n#]+)/gm; - let m: RegExpExecArray | null; - while ((m = importRegex.exec(stripped)) !== null) { - for (const piece of m[1].split(',')) { - const name = piece - .trim() - .split(/\s+as\s+/)[0] - .trim(); - if (!name) continue; - addTopLevelImport(name, imports); + try { + const { stdout } = await execFileAsync('pipreqs', ['--print', '--mode', 'no-pin', fileDir]); + + const packages = new Set<string>(); + for (const line of stdout.split('\n')) { + const pkg = line.trim().toLowerCase(); + if (pkg && !EXCLUDED_PACKAGES.has(pkg)) { + packages.add(pkg); + } } - } - // `from a.b import c[, d]`. Skip relative imports (`from . import …`).
- const fromRegex = /^[ \t]*from[ \t]+([^\s]+)[ \t]+import[ \t]/gm; - while ((m = fromRegex.exec(stripped)) !== null) { - const moduleName = m[1].trim(); - if (moduleName.startsWith('.')) continue; - addTopLevelImport(moduleName, imports); + return packages; + } catch (error) { + const err = error as { message: string; stderr?: string; stdout?: string }; + const details = [err.stderr, err.stdout].map((text) => text?.trim()).find(Boolean) ?? err.message; + throw new SfError( + `Failed to scan imports using pipreqs: ${details}. Hint: ensure 'pipreqs' is installed in the Python environment.`, + 'PipreqsScanError' + ); } - - return imports; -} - -function addTopLevelImport(qualified: string, into: Set<string>): void { - const top = qualified.split('.')[0]; - if (!top) return; - if (top.startsWith('_')) return; - if (PYTHON_STDLIB_MODULES.has(top)) return; - if (EXCLUDED_PACKAGES.has(top)) return; - into.add(top); } /** diff --git a/test/utils/nativeScan.test.ts b/test/utils/nativeScan.test.ts index f448db3..a807b55 100644 --- a/test/utils/nativeScan.test.ts +++ b/test/utils/nativeScan.test.ts @@ -249,6 +249,7 @@ describe('nativeScan: scanFileForImports', () => { 'from pyspark.sql.functions import col', 'import _internal', 'from . import sibling', + 'from ..
import config', 'from datetime import datetime', '"""docstring with import requests"""', '# import urllib', @@ -267,6 +268,48 @@ describe('nativeScan: scanFileForImports', () => { expect([...imports].sort()).to.deep.equal(['numpy', 'pandas']); await fs.rm(tmp, { recursive: true, force: true }); }); + + it('filters out local modules that exist as .py files in the same directory', async () => { + const tmp = await fs.mkdtemp(path.join(os.tmpdir(), 'imports-')); + const file = path.join(tmp, 'entrypoint.py'); + await fs.writeFile(path.join(tmp, 'helper.py'), '# local helper module\n'); + await fs.writeFile( + file, + ['import pandas', 'import numpy', 'import helper', 'from helper import some_function as f'].join('\n') + ); + const imports = await scanFileForImports(file); + expect([...imports].sort()).to.deep.equal(['numpy', 'pandas']); + await fs.rm(tmp, { recursive: true, force: true }); + }); + + it('filters out local packages that exist as subdirectories', async () => { + const tmp = await fs.mkdtemp(path.join(os.tmpdir(), 'imports-')); + const file = path.join(tmp, 'entrypoint.py'); + await fs.mkdir(path.join(tmp, 'utils'), { recursive: true }); + await fs.writeFile(path.join(tmp, 'utils', '__init__.py'), '# utils package\n'); + await fs.writeFile(path.join(tmp, 'utils', 'helper.py'), 'def process(): pass\n'); + await fs.writeFile(file, ['import pandas', 'import numpy', 'from utils import helper', 'import utils'].join('\n')); + const imports = await scanFileForImports(file); + expect([...imports].sort()).to.deep.equal(['numpy', 'pandas']); + await fs.rm(tmp, { recursive: true, force: true }); + }); + + it('filters nested local packages by checking only top-level directory', async () => { + const tmp = await fs.mkdtemp(path.join(os.tmpdir(), 'imports-')); + const file = path.join(tmp, 'entrypoint.py'); + // Create deeply nested local package structure + await fs.mkdir(path.join(tmp, 'utils', 'nested', 'deep'), { recursive: true }); + await 
fs.writeFile(path.join(tmp, 'utils', '__init__.py'), ''); + await fs.writeFile(path.join(tmp, 'utils', 'nested', '__init__.py'), ''); + await fs.writeFile(path.join(tmp, 'utils', 'nested', 'deep', 'module.py'), 'def fn(): pass\n'); + await fs.writeFile( + file, + ['import pandas', 'from utils.nested.deep import module', 'import utils.nested'].join('\n') + ); + const imports = await scanFileForImports(file); + expect([...imports].sort()).to.deep.equal(['pandas']); + await fs.rm(tmp, { recursive: true, force: true }); + }); }); describe('nativeScan: writeRequirementsFile', () => {