Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
298 changes: 27 additions & 271 deletions src/utils/nativeScan.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,238 +67,6 @@ const SDK_CONFIG_FILE = 'sdk_config.json';

const DATA_ACCESS_METHODS = new Set(['read_dlo', 'read_dmo', 'write_to_dlo', 'write_to_dmo']);

/**
* Python stdlib module names to exclude from requirements.txt.
*
* Mirrors `sys.stdlib_module_names` from the Python 3.11 runtimes the SDK supports.
* Using the stdlib names of this version; if a package becomes stdlib in a
* later version we still want to drop it (it's not on PyPI and shipping it as a requirement
* would break installs on the runtime).
*/
const PYTHON_STDLIB_MODULES = new Set<string>([
'__future__',

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We no longer need to maintain the stdlib module list, because the pipreqs scan already filters out standard-library modules.

'_ast',
'_thread',
'abc',
'aifc',
'antigravity',
'argparse',
'array',
'ast',
'asynchat',
'asyncio',
'asyncore',
'atexit',
'audioop',
'base64',
'bdb',
'binascii',
'binhex',
'bisect',
'builtins',
'bz2',
'cProfile',
'calendar',
'cgi',
'cgitb',
'chunk',
'cmath',
'cmd',
'code',
'codecs',
'codeop',
'collections',
'colorsys',
'compileall',
'concurrent',
'configparser',
'contextlib',
'contextvars',
'copy',
'copyreg',
'crypt',
'csv',
'ctypes',
'curses',
'dataclasses',
'datetime',
'dbm',
'decimal',
'difflib',
'dis',
'distutils',
'doctest',
'email',
'encodings',
'ensurepip',
'enum',
'errno',
'faulthandler',
'fcntl',
'filecmp',
'fileinput',
'fnmatch',
'fractions',
'ftplib',
'functools',
'gc',
'genericpath',
'getopt',
'getpass',
'gettext',
'glob',
'graphlib',
'grp',
'gzip',
'hashlib',
'heapq',
'hmac',
'html',
'http',
'idlelib',
'imaplib',
'imghdr',
'imp',
'importlib',
'inspect',
'io',
'ipaddress',
'itertools',
'json',
'keyword',
'lib2to3',
'linecache',
'locale',
'logging',
'lzma',
'mailbox',
'mailcap',
'marshal',
'math',
'mimetypes',
'mmap',
'modulefinder',
'msilib',
'msvcrt',
'multiprocessing',
'netrc',
'nis',
'nntplib',
'nt',
'ntpath',
'nturl2path',
'numbers',
'opcode',
'operator',
'optparse',
'os',
'ossaudiodev',
'pathlib',
'pdb',
'pickle',
'pickletools',
'pipes',
'pkgutil',
'platform',
'plistlib',
'poplib',
'posix',
'posixpath',
'pprint',
'profile',
'pstats',
'pty',
'pwd',
'py_compile',
'pyclbr',
'pydoc',
'pydoc_data',
'pyexpat',
'queue',
'quopri',
'random',
're',
'readline',
'reprlib',
'resource',
'rlcompleter',
'runpy',
'sched',
'secrets',
'select',
'selectors',
'shelve',
'shlex',
'shutil',
'signal',
'site',
'smtpd',
'smtplib',
'sndhdr',
'socket',
'socketserver',
'spwd',
'sqlite3',
'sre_compile',
'sre_constants',
'sre_parse',
'ssl',
'stat',
'statistics',
'string',
'stringprep',
'struct',
'subprocess',
'sunau',
'symtable',
'sys',
'sysconfig',
'syslog',
'tabnanny',
'tarfile',
'telnetlib',
'tempfile',
'termios',
'textwrap',
'this',
'threading',
'time',
'timeit',
'tkinter',
'token',
'tokenize',
'tomllib',
'trace',
'traceback',
'tracemalloc',
'tty',
'turtle',
'turtledemo',
'types',
'typing',
'unicodedata',
'unittest',
'urllib',
'uu',
'uuid',
'venv',
'warnings',
'wave',
'weakref',
'webbrowser',
'winreg',
'winsound',
'wsgiref',
'xdrlib',
'xml',
'xmlrpc',
'zipapp',
'zipfile',
'zipimport',
'zlib',
'zoneinfo',
]);

const EXCLUDED_PACKAGES = new Set<string>(['datacustomcode', 'pyspark']);

/** Mirror of `datacustomcode/scan.py:get_sdk_config_path`. */
Expand Down Expand Up @@ -552,52 +320,40 @@ function validateAccessLayer(calls: DataAccessLayerCalls): void {
/**
* Mirror of `datacustomcode/scan.py:ImportVisitor.scan_file_for_imports`.
*
* Extracts top-level import names from `import X[, Y]` and `from X import …` statements,
* dropping stdlib modules, packages starting with `_`, and the SDK's own packages.
* Skips relative imports (`from . import x`) since they aren't PyPI dependencies.
* This function uses pipreqs to scan Python files for external package dependencies.
* - Scans all Python files in the directory
* - Extracts import statements
* - Filters out stdlib modules
* - Filters out local modules
*/
export async function scanFileForImports(filePath: string): Promise<Set<string>> {
const code = await fs.readFile(filePath, 'utf8');
const imports = new Set<string>();
const fileDir = path.dirname(filePath);

// Strip docstrings/triple-quoted blocks and `#`-style comments to avoid false positives.
const stripped = code
.replace(/"""[\s\S]*?"""/g, '')
.replace(/'''[\s\S]*?'''/g, '')
.replace(/(^|\s)#.*$/gm, '$1');
// scan the directory containing the entrypoint.py file
const { execFile } = await import('node:child_process');
const { promisify } = await import('node:util');
const execFileAsync = promisify(execFile);

// `import a, b.c as alias`
const importRegex = /^[ \t]*import[ \t]+([^\n#]+)/gm;
let m: RegExpExecArray | null;
while ((m = importRegex.exec(stripped)) !== null) {
for (const piece of m[1].split(',')) {
const name = piece
.trim()
.split(/\s+as\s+/)[0]
.trim();
if (!name) continue;
addTopLevelImport(name, imports);
try {
const { stdout } = await execFileAsync('pipreqs', ['--print', '--mode', 'no-pin', fileDir]);

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This runs the `pipreqs` command-line tool (a Python program) as a child process.


const packages = new Set<string>();
for (const line of stdout.split('\n')) {
const pkg = line.trim().toLowerCase();
if (pkg && !EXCLUDED_PACKAGES.has(pkg)) {
packages.add(pkg);
}
}
}

// `from a.b import c[, d]`. Skip relative imports (`from . import …`).
const fromRegex = /^[ \t]*from[ \t]+([^\s]+)[ \t]+import[ \t]/gm;
while ((m = fromRegex.exec(stripped)) !== null) {
const moduleName = m[1].trim();
if (moduleName.startsWith('.')) continue;
addTopLevelImport(moduleName, imports);
return packages;
} catch (error) {
const err = error as { message: string; stderr?: string; stdout?: string };
const details = err.stderr ?? err.stdout ?? err.message;
throw new SfError(
`Failed to scan imports using pipreqs: ${details}. Hint: ensure 'pipreqs' is installed in the Python environment.`,
'PipreqsScanError'
);
}

return imports;
}

function addTopLevelImport(qualified: string, into: Set<string>): void {
const top = qualified.split('.')[0];
if (!top) return;
if (top.startsWith('_')) return;
if (PYTHON_STDLIB_MODULES.has(top)) return;
if (EXCLUDED_PACKAGES.has(top)) return;
into.add(top);
}

/**
Expand Down
43 changes: 43 additions & 0 deletions test/utils/nativeScan.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ describe('nativeScan: scanFileForImports', () => {
'from pyspark.sql.functions import col',
'import _internal',
'from . import sibling',
'from .. import config',
'from datetime import datetime',
'"""docstring with import requests"""',
'# import urllib',
Expand All @@ -267,6 +268,48 @@ describe('nativeScan: scanFileForImports', () => {
expect([...imports].sort()).to.deep.equal(['numpy', 'pandas']);
await fs.rm(tmp, { recursive: true, force: true });
});

it('filters out local modules that exist as .py files in the same directory', async () => {
const tmp = await fs.mkdtemp(path.join(os.tmpdir(), 'imports-'));
const file = path.join(tmp, 'entrypoint.py');
await fs.writeFile(path.join(tmp, 'helper.py'), '# local helper module\n');
await fs.writeFile(
file,
['import pandas', 'import numpy', 'import helper', 'from helper import some_function as f'].join('\n')
);
const imports = await scanFileForImports(file);
expect([...imports].sort()).to.deep.equal(['numpy', 'pandas']);
await fs.rm(tmp, { recursive: true, force: true });
});

it('filters out local packages that exist as subdirectories', async () => {
const tmp = await fs.mkdtemp(path.join(os.tmpdir(), 'imports-'));
const file = path.join(tmp, 'entrypoint.py');
await fs.mkdir(path.join(tmp, 'utils'), { recursive: true });
await fs.writeFile(path.join(tmp, 'utils', '__init__.py'), '# utils package\n');
await fs.writeFile(path.join(tmp, 'utils', 'helper.py'), 'def process(): pass\n');
await fs.writeFile(file, ['import pandas', 'import numpy', 'from utils import helper', 'import utils'].join('\n'));
const imports = await scanFileForImports(file);
expect([...imports].sort()).to.deep.equal(['numpy', 'pandas']);
await fs.rm(tmp, { recursive: true, force: true });
});

it('filters nested local packages by checking only top-level directory', async () => {
const tmp = await fs.mkdtemp(path.join(os.tmpdir(), 'imports-'));
const file = path.join(tmp, 'entrypoint.py');
// Create deeply nested local package structure
await fs.mkdir(path.join(tmp, 'utils', 'nested', 'deep'), { recursive: true });
await fs.writeFile(path.join(tmp, 'utils', '__init__.py'), '');
await fs.writeFile(path.join(tmp, 'utils', 'nested', '__init__.py'), '');
await fs.writeFile(path.join(tmp, 'utils', 'nested', 'deep', 'module.py'), 'def fn(): pass\n');
await fs.writeFile(
file,
['import pandas', 'from utils.nested.deep import module', 'import utils.nested'].join('\n')
);
const imports = await scanFileForImports(file);
expect([...imports].sort()).to.deep.equal(['pandas']);
await fs.rm(tmp, { recursive: true, force: true });
});
});

describe('nativeScan: writeRequirementsFile', () => {
Expand Down