From c7edae232fbf581c6113e2bc83fddaafdfc597bc Mon Sep 17 00:00:00 2001 From: Diksha Date: Tue, 9 Jun 2026 12:44:10 +0530 Subject: [PATCH 1/3] fix: exclude local modules from scan command --- src/utils/nativeScan.ts | 19 ++++++++++++++++++- test/utils/nativeScan.test.ts | 19 +++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/utils/nativeScan.ts b/src/utils/nativeScan.ts index bc47934..0ac9673 100644 --- a/src/utils/nativeScan.ts +++ b/src/utils/nativeScan.ts @@ -588,7 +588,24 @@ export async function scanFileForImports(filePath: string): Promise> addTopLevelImport(moduleName, imports); } - return imports; + // Filter out local modules + const fileDir = path.dirname(filePath); + const checks = await Promise.all( + Array.from(imports).map(async (pkg) => ({ + pkg, + isLocal: await pathExists(path.join(fileDir, `${pkg}.py`)), + })) + ); + + const filteredImports = new Set(); + for (const { pkg, isLocal } of checks) { + if (!isLocal) { + // Not a local module, keep it in the imports + filteredImports.add(pkg); + } + } + + return filteredImports; } function addTopLevelImport(qualified: string, into: Set): void { diff --git a/test/utils/nativeScan.test.ts b/test/utils/nativeScan.test.ts index f448db3..1084f3c 100644 --- a/test/utils/nativeScan.test.ts +++ b/test/utils/nativeScan.test.ts @@ -267,6 +267,25 @@ describe('nativeScan: scanFileForImports', () => { expect([...imports].sort()).to.deep.equal(['numpy', 'pandas']); await fs.rm(tmp, { recursive: true, force: true }); }); + + it('filters out local modules that exist as .py files in the same directory', async () => { + const tmp = await fs.mkdtemp(path.join(os.tmpdir(), 'imports-')); + const file = path.join(tmp, 'entrypoint.py'); + // Create a local module file that should be filtered out + await fs.writeFile(path.join(tmp, 'helper.py'), '# local helper module\n'); + await fs.writeFile( + file, + [ + 'import pandas', + 'import numpy', + 'import helper', // local module - should be 
filtered + 'from helper import some_function', // also local - should be filtered + ].join('\n') + ); + const imports = await scanFileForImports(file); + expect([...imports].sort()).to.deep.equal(['numpy', 'pandas']); + await fs.rm(tmp, { recursive: true, force: true }); + }); }); describe('nativeScan: writeRequirementsFile', () => { From db04377b8835154ffef574e6d98e98bd2adbbb3c Mon Sep 17 00:00:00 2001 From: Diksha Date: Tue, 9 Jun 2026 16:18:14 +0530 Subject: [PATCH 2/3] fix: include directories and subdirectories for local module detection --- src/utils/nativeScan.ts | 29 +++++++++++++++++++++++++- test/utils/nativeScan.test.ts | 39 ++++++++++++++++++++++++++++------- 2 files changed, 60 insertions(+), 8 deletions(-) diff --git a/src/utils/nativeScan.ts b/src/utils/nativeScan.ts index 0ac9673..5890c57 100644 --- a/src/utils/nativeScan.ts +++ b/src/utils/nativeScan.ts @@ -593,7 +593,7 @@ export async function scanFileForImports(filePath: string): Promise> const checks = await Promise.all( Array.from(imports).map(async (pkg) => ({ pkg, - isLocal: await pathExists(path.join(fileDir, `${pkg}.py`)), + isLocal: await isLocalModule(pkg, fileDir), })) ); @@ -777,6 +777,33 @@ async function pathExists(p: string): Promise { } } +/** + * Check if an import name resolves to a local module in the same directory. + * 1. pkg.py - module file + * 2. 
pkg/ directory with .py files - package (with or without __init__.py) + */ +async function isLocalModule(pkg: string, fileDir: string): Promise { + const [isModuleFile, isPackageDir] = await Promise.all([ + pathExists(path.join(fileDir, `${pkg}.py`)), + (async (): Promise => { + const pkgDir = path.join(fileDir, pkg); + try { + const stat = await fs.stat(pkgDir); + if (!stat.isDirectory()) { + return false; + } + // Check if directory contains any Python files + const entries = await fs.readdir(pkgDir); + return entries.some((entry) => entry.endsWith('.py')); + } catch { + return false; + } + })(), + ]); + + return isModuleFile || isPackageDir; +} + function dirExistsSync(p: string): boolean { try { accessSync(p); diff --git a/test/utils/nativeScan.test.ts b/test/utils/nativeScan.test.ts index 1084f3c..e2d57e3 100644 --- a/test/utils/nativeScan.test.ts +++ b/test/utils/nativeScan.test.ts @@ -249,6 +249,8 @@ describe('nativeScan: scanFileForImports', () => { 'from pyspark.sql.functions import col', 'import _internal', 'from . import sibling', + 'from .. 
import config', + 'from .utils import helper', 'from datetime import datetime', '"""docstring with import requests"""', '# import urllib', @@ -271,21 +273,44 @@ describe('nativeScan: scanFileForImports', () => { it('filters out local modules that exist as .py files in the same directory', async () => { const tmp = await fs.mkdtemp(path.join(os.tmpdir(), 'imports-')); const file = path.join(tmp, 'entrypoint.py'); - // Create a local module file that should be filtered out await fs.writeFile(path.join(tmp, 'helper.py'), '# local helper module\n'); await fs.writeFile( file, - [ - 'import pandas', - 'import numpy', - 'import helper', // local module - should be filtered - 'from helper import some_function', // also local - should be filtered - ].join('\n') + ['import pandas', 'import numpy', 'import helper', 'from helper import some_function as f'].join('\n') ); const imports = await scanFileForImports(file); expect([...imports].sort()).to.deep.equal(['numpy', 'pandas']); await fs.rm(tmp, { recursive: true, force: true }); }); + + it('filters out local packages that exist as subdirectories', async () => { + const tmp = await fs.mkdtemp(path.join(os.tmpdir(), 'imports-')); + const file = path.join(tmp, 'entrypoint.py'); + await fs.mkdir(path.join(tmp, 'utils'), { recursive: true }); + await fs.writeFile(path.join(tmp, 'utils', '__init__.py'), '# utils package\n'); + await fs.writeFile(path.join(tmp, 'utils', 'helper.py'), 'def process(): pass\n'); + await fs.writeFile(file, ['import pandas', 'import numpy', 'from utils import helper', 'import utils'].join('\n')); + const imports = await scanFileForImports(file); + expect([...imports].sort()).to.deep.equal(['numpy', 'pandas']); + await fs.rm(tmp, { recursive: true, force: true }); + }); + + it('filters nested local packages by checking only top-level directory', async () => { + const tmp = await fs.mkdtemp(path.join(os.tmpdir(), 'imports-')); + const file = path.join(tmp, 'entrypoint.py'); + // Create deeply nested local 
package structure + await fs.mkdir(path.join(tmp, 'utils', 'nested', 'deep'), { recursive: true }); + await fs.writeFile(path.join(tmp, 'utils', '__init__.py'), ''); + await fs.writeFile(path.join(tmp, 'utils', 'nested', '__init__.py'), ''); + await fs.writeFile(path.join(tmp, 'utils', 'nested', 'deep', 'module.py'), 'def fn(): pass\n'); + await fs.writeFile( + file, + ['import pandas', 'from utils.nested.deep import module', 'import utils.nested'].join('\n') + ); + const imports = await scanFileForImports(file); + expect([...imports].sort()).to.deep.equal(['pandas']); + await fs.rm(tmp, { recursive: true, force: true }); + }); }); describe('nativeScan: writeRequirementsFile', () => { From f1b53ebbe3b9e4983b09127376b37ca5c107d5af Mon Sep 17 00:00:00 2001 From: Diksha Date: Wed, 10 Jun 2026 19:33:27 +0530 Subject: [PATCH 3/3] fix: updated impl to use pipreqs --- src/utils/nativeScan.ts | 342 +++------------------------------- test/utils/nativeScan.test.ts | 1 - 2 files changed, 27 insertions(+), 316 deletions(-) diff --git a/src/utils/nativeScan.ts b/src/utils/nativeScan.ts index 5890c57..8840555 100644 --- a/src/utils/nativeScan.ts +++ b/src/utils/nativeScan.ts @@ -67,238 +67,6 @@ const SDK_CONFIG_FILE = 'sdk_config.json'; const DATA_ACCESS_METHODS = new Set(['read_dlo', 'read_dmo', 'write_to_dlo', 'write_to_dmo']); -/** - * Python stdlib module names to exclude from requirements.txt. - * - * Mirrors `sys.stdlib_module_names` from the Python 3.11 runtimes the SDK supports. - * Using the stdlib names of this version; if a package becomes stdlib in a - * later version we still want to drop it (it's not on PyPI and shipping it as a requirement - * would break installs on the runtime). 
- */ -const PYTHON_STDLIB_MODULES = new Set([ - '__future__', - '_ast', - '_thread', - 'abc', - 'aifc', - 'antigravity', - 'argparse', - 'array', - 'ast', - 'asynchat', - 'asyncio', - 'asyncore', - 'atexit', - 'audioop', - 'base64', - 'bdb', - 'binascii', - 'binhex', - 'bisect', - 'builtins', - 'bz2', - 'cProfile', - 'calendar', - 'cgi', - 'cgitb', - 'chunk', - 'cmath', - 'cmd', - 'code', - 'codecs', - 'codeop', - 'collections', - 'colorsys', - 'compileall', - 'concurrent', - 'configparser', - 'contextlib', - 'contextvars', - 'copy', - 'copyreg', - 'crypt', - 'csv', - 'ctypes', - 'curses', - 'dataclasses', - 'datetime', - 'dbm', - 'decimal', - 'difflib', - 'dis', - 'distutils', - 'doctest', - 'email', - 'encodings', - 'ensurepip', - 'enum', - 'errno', - 'faulthandler', - 'fcntl', - 'filecmp', - 'fileinput', - 'fnmatch', - 'fractions', - 'ftplib', - 'functools', - 'gc', - 'genericpath', - 'getopt', - 'getpass', - 'gettext', - 'glob', - 'graphlib', - 'grp', - 'gzip', - 'hashlib', - 'heapq', - 'hmac', - 'html', - 'http', - 'idlelib', - 'imaplib', - 'imghdr', - 'imp', - 'importlib', - 'inspect', - 'io', - 'ipaddress', - 'itertools', - 'json', - 'keyword', - 'lib2to3', - 'linecache', - 'locale', - 'logging', - 'lzma', - 'mailbox', - 'mailcap', - 'marshal', - 'math', - 'mimetypes', - 'mmap', - 'modulefinder', - 'msilib', - 'msvcrt', - 'multiprocessing', - 'netrc', - 'nis', - 'nntplib', - 'nt', - 'ntpath', - 'nturl2path', - 'numbers', - 'opcode', - 'operator', - 'optparse', - 'os', - 'ossaudiodev', - 'pathlib', - 'pdb', - 'pickle', - 'pickletools', - 'pipes', - 'pkgutil', - 'platform', - 'plistlib', - 'poplib', - 'posix', - 'posixpath', - 'pprint', - 'profile', - 'pstats', - 'pty', - 'pwd', - 'py_compile', - 'pyclbr', - 'pydoc', - 'pydoc_data', - 'pyexpat', - 'queue', - 'quopri', - 'random', - 're', - 'readline', - 'reprlib', - 'resource', - 'rlcompleter', - 'runpy', - 'sched', - 'secrets', - 'select', - 'selectors', - 'shelve', - 'shlex', - 'shutil', - 'signal', - 
'site', - 'smtpd', - 'smtplib', - 'sndhdr', - 'socket', - 'socketserver', - 'spwd', - 'sqlite3', - 'sre_compile', - 'sre_constants', - 'sre_parse', - 'ssl', - 'stat', - 'statistics', - 'string', - 'stringprep', - 'struct', - 'subprocess', - 'sunau', - 'symtable', - 'sys', - 'sysconfig', - 'syslog', - 'tabnanny', - 'tarfile', - 'telnetlib', - 'tempfile', - 'termios', - 'textwrap', - 'this', - 'threading', - 'time', - 'timeit', - 'tkinter', - 'token', - 'tokenize', - 'tomllib', - 'trace', - 'traceback', - 'tracemalloc', - 'tty', - 'turtle', - 'turtledemo', - 'types', - 'typing', - 'unicodedata', - 'unittest', - 'urllib', - 'uu', - 'uuid', - 'venv', - 'warnings', - 'wave', - 'weakref', - 'webbrowser', - 'winreg', - 'winsound', - 'wsgiref', - 'xdrlib', - 'xml', - 'xmlrpc', - 'zipapp', - 'zipfile', - 'zipimport', - 'zlib', - 'zoneinfo', -]); - const EXCLUDED_PACKAGES = new Set(['datacustomcode', 'pyspark']); /** Mirror of `datacustomcode/scan.py:get_sdk_config_path`. */ @@ -552,69 +320,40 @@ function validateAccessLayer(calls: DataAccessLayerCalls): void { /** * Mirror of `datacustomcode/scan.py:ImportVisitor.scan_file_for_imports`. * - * Extracts top-level import names from `import X[, Y]` and `from X import …` statements, - * dropping stdlib modules, packages starting with `_`, and the SDK's own packages. - * Skips relative imports (`from . import x`) since they aren't PyPI dependencies. + * This function uses pipreqs to scan Python files for external package dependencies. + * - Scans all Python files in the directory containing `filePath`, not only `filePath` itself + * - Extracts import statements + * - Filters out stdlib modules + * - Filters out local modules */ export async function scanFileForImports(filePath: string): Promise<Set<string>> { - const code = await fs.readFile(filePath, 'utf8'); - const imports = new Set(); - - // Strip docstrings/triple-quoted blocks and `#`-style comments to avoid false positives. 
- const stripped = code - .replace(/"""[\s\S]*?"""/g, '') - .replace(/'''[\s\S]*?'''/g, '') - .replace(/(^|\s)#.*$/gm, '$1'); + const fileDir = path.dirname(filePath); - // `import a, b.c as alias` - const importRegex = /^[ \t]*import[ \t]+([^\n#]+)/gm; - let m: RegExpExecArray | null; - while ((m = importRegex.exec(stripped)) !== null) { - for (const piece of m[1].split(',')) { - const name = piece - .trim() - .split(/\s+as\s+/)[0] - .trim(); - if (!name) continue; - addTopLevelImport(name, imports); - } - } + // scan the directory containing the entrypoint.py file + const { execFile } = await import('node:child_process'); + const { promisify } = await import('node:util'); + const execFileAsync = promisify(execFile); - // `from a.b import c[, d]`. Skip relative imports (`from . import …`). - const fromRegex = /^[ \t]*from[ \t]+([^\s]+)[ \t]+import[ \t]/gm; - while ((m = fromRegex.exec(stripped)) !== null) { - const moduleName = m[1].trim(); - if (moduleName.startsWith('.')) continue; - addTopLevelImport(moduleName, imports); - } + try { + const { stdout } = await execFileAsync('pipreqs', ['--print', '--mode', 'no-pin', fileDir]); - // Filter out local modules - const fileDir = path.dirname(filePath); - const checks = await Promise.all( - Array.from(imports).map(async (pkg) => ({ - pkg, - isLocal: await isLocalModule(pkg, fileDir), - })) - ); - - const filteredImports = new Set(); - for (const { pkg, isLocal } of checks) { - if (!isLocal) { - // Not a local module, keep it in the imports - filteredImports.add(pkg); + const packages = new Set<string>(); + for (const line of stdout.split('\n')) { + const pkg = line.trim().toLowerCase(); + if (pkg && !EXCLUDED_PACKAGES.has(pkg)) { + packages.add(pkg); + } } - } - - return filteredImports; -} -function addTopLevelImport(qualified: string, into: Set): void { - const top = qualified.split('.')[0]; - if (!top) return; - if (top.startsWith('_')) return; - if (PYTHON_STDLIB_MODULES.has(top)) return; - if 
(EXCLUDED_PACKAGES.has(top)) return; - into.add(top); + return packages; + } catch (error) { + const err = error as { message: string; stderr?: string; stdout?: string }; + const details = err.stderr || err.stdout || err.message; + throw new SfError( + `Failed to scan imports using pipreqs: ${details}. Hint: ensure 'pipreqs' is installed in the Python environment.`, + 'PipreqsScanError' + ); + } } /** @@ -777,33 +516,6 @@ async function pathExists(p: string): Promise { } } -/** - * Check if an import name resolves to a local module in the same directory. - * 1. pkg.py - module file - * 2. pkg/ directory with .py files - package (with or without __init__.py) - */ -async function isLocalModule(pkg: string, fileDir: string): Promise { - const [isModuleFile, isPackageDir] = await Promise.all([ - pathExists(path.join(fileDir, `${pkg}.py`)), - (async (): Promise => { - const pkgDir = path.join(fileDir, pkg); - try { - const stat = await fs.stat(pkgDir); - if (!stat.isDirectory()) { - return false; - } - // Check if directory contains any Python files - const entries = await fs.readdir(pkgDir); - return entries.some((entry) => entry.endsWith('.py')); - } catch { - return false; - } - })(), - ]); - - return isModuleFile || isPackageDir; -} - function dirExistsSync(p: string): boolean { try { accessSync(p); diff --git a/test/utils/nativeScan.test.ts b/test/utils/nativeScan.test.ts index e2d57e3..a807b55 100644 --- a/test/utils/nativeScan.test.ts +++ b/test/utils/nativeScan.test.ts @@ -250,7 +250,6 @@ describe('nativeScan: scanFileForImports', () => { 'import _internal', 'from . import sibling', 'from .. import config', - 'from .utils import helper', 'from datetime import datetime', '"""docstring with import requests"""', '# import urllib',