diff --git a/rust/Cargo.toml b/rust/Cargo.toml --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -1,3 +1,3 @@ [workspace] members = ["hg-core", "hg-direct-ffi", "hg-cpython"] -exclude = ["chg", "hgcli"] +exclude = ["chg", "hgcli", "pyembed"] diff --git a/rust/pyembed/Cargo.toml b/rust/pyembed/Cargo.toml new file mode 100644 --- /dev/null +++ b/rust/pyembed/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "pyembed" +version = "0.3.0" +authors = ["Gregory Szorc "] +edition = "2018" +build = "build.rs" + +[dependencies] +byteorder = "1" +jemalloc-sys = { version = "0.3", optional = true } +lazy_static = "1.3" +libc = "0.2" +uuid = { version = "0.7", features = ["v4"] } + +[dependencies.python3-sys] +git = "https://github.com/indygreg/PyOxidizer.git" +tag = "v0.3.0" + +[dependencies.cpython] +git = "https://github.com/indygreg/PyOxidizer.git" +tag = "v0.3.0" +features = ["link-mode-unresolved-static", "python3-sys", "no-auto-initialize"] + +[features] +default = [] +jemalloc = ["jemalloc-sys"] diff --git a/rust/pyembed/build.rs b/rust/pyembed/build.rs new file mode 100644 --- /dev/null +++ b/rust/pyembed/build.rs @@ -0,0 +1,65 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::env; +use std::path::PathBuf; +use std::process; + +/// Path to pyoxidizer executable this file was created with. +const DEFAULT_PYOXIDIZER_EXE: &str = r#"/Users/gps/.cargo/bin/pyoxidizer"#; + +fn main() { + // We support using pre-built artifacts, in which case we emit the + // cargo metadata lines from the "original" build to "register" the + // artifacts with this cargo invocation. 
+ if env::var("PYOXIDIZER_REUSE_ARTIFACTS").is_ok() { + let artifact_dir_env = env::var("PYOXIDIZER_ARTIFACT_DIR"); + + let artifact_dir_path = match artifact_dir_env { + Ok(ref v) => PathBuf::from(v), + Err(_) => { + let out_dir = env::var("OUT_DIR").unwrap(); + PathBuf::from(&out_dir) + } + }; + + println!( + "using pre-built artifacts from {}", + artifact_dir_path.display() + ); + + println!("cargo:rerun-if-env-changed=PYOXIDIZER_REUSE_ARTIFACTS"); + println!("cargo:rerun-if-env-changed=PYOXIDIZER_ARTIFACT_DIR"); + + // Emit the cargo metadata lines to register libraries for linking. + let cargo_metadata_path = artifact_dir_path.join("cargo_metadata.txt"); + let metadata = std::fs::read_to_string(&cargo_metadata_path) + .expect(format!("failed to read {}", cargo_metadata_path.display()).as_str()); + println!("{}", metadata); + } else { + let pyoxidizer_exe = match env::var("PYOXIDIZER_EXE") { + Ok(value) => value, + Err(_) => DEFAULT_PYOXIDIZER_EXE.to_string(), + }; + + let pyoxidizer_path = PathBuf::from(&pyoxidizer_exe); + + if !pyoxidizer_path.exists() { + panic!("pyoxidizer executable does not exist: {}", &pyoxidizer_exe); + } + + match process::Command::new(&pyoxidizer_exe) + .arg("run-build-script") + .arg("build.rs") + .status() + { + Ok(status) => { + if !status.success() { + panic!("`pyoxidizer run-build-script` failed"); + } + } + Err(e) => panic!("`pyoxidizer run-build-script` failed: {}", e.to_string()), + } + } +} diff --git a/rust/pyembed/src/config.rs b/rust/pyembed/src/config.rs new file mode 100644 --- /dev/null +++ b/rust/pyembed/src/config.rs @@ -0,0 +1,159 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Data structures for configuring a Python interpreter. + +use python3_sys as pyffi; +use std::ffi::CString; + +/// Defines which allocator to use for the raw domain. 
+#[derive(Clone, Debug)] +pub enum PythonRawAllocator { + /// Use jemalloc. + Jemalloc, + /// Use the Rust global allocator. + Rust, + /// Use the system allocator. + System, +} + +/// Defines Python code to run. +#[derive(Clone, Debug)] +pub enum PythonRunMode { + /// No-op. + None, + /// Run a Python REPL. + Repl, + /// Run a Python module as the main module. + Module { module: String }, + /// Evaluate Python code from a string. + Eval { code: String }, +} + +/// Defines `terminfo` database resolution semantics. +#[derive(Clone, Debug)] +pub enum TerminfoResolution { + /// Resolve `terminfo` database using appropriate behavior for current OS. + Dynamic, + /// Do not attempt to resolve the `terminfo` database. Basically a no-op. + None, + /// Use a specified string as the `TERMINFO_DIRS` value. + Static(String), +} + +/// Defines an extra extension module to load. +#[derive(Clone, Debug)] +pub struct ExtensionModule { + /// Name of the extension module. + pub name: CString, + + /// Extension module initialization function. + pub init_func: unsafe extern "C" fn() -> *mut pyffi::PyObject, +} + +/// Holds the configuration of an embedded Python interpreter. +/// +/// Instances of this struct can be used to construct Python interpreters. +/// +/// Each instance contains the total state to define the run-time behavior of +/// a Python interpreter. +#[derive(Clone, Debug)] +pub struct PythonConfig { + /// Name of encoding for stdio handles. + pub standard_io_encoding: Option, + + /// Name of encoding error mode for stdio handles. + pub standard_io_errors: Option, + + /// Python optimization level. + pub opt_level: i32, + + /// Whether to load our custom frozen importlib bootstrap modules. + pub use_custom_importlib: bool, + + /// Whether to load the filesystem-based sys.meta_path finder. + pub filesystem_importer: bool, + + /// Filesystem paths to add to sys.path. + /// + /// ``$ORIGIN`` will resolve to the directory of the application at + /// run-time. 
+ pub sys_paths: Vec, + + /// Whether to load the site.py module at initialization time. + pub import_site: bool, + + /// Whether to load a user-specific site module at initialization time. + pub import_user_site: bool, + + /// Whether to ignore various PYTHON* environment variables. + pub ignore_python_env: bool, + + /// Whether to suppress writing of ``.pyc`` files when importing ``.py`` + /// files from the filesystem. This is typically irrelevant since modules + /// are imported from memory. + pub dont_write_bytecode: bool, + + /// Whether stdout and stderr streams should be unbuffered. + pub unbuffered_stdio: bool, + + /// Bytecode for the importlib._bootstrap / _frozen_importlib module. + pub frozen_importlib_data: &'static [u8], + + /// Bytecode for the importlib._bootstrap_external / _frozen_importlib_external module. + pub frozen_importlib_external_data: &'static [u8], + + /// Reference to raw Python modules data. + /// + /// The referenced data is produced as part of PyOxidizer packaging. This + /// likely comes from an include_bytes!(...) of a file generated by PyOxidizer. + pub py_modules_data: &'static [u8], + + /// Reference to raw Python resources data. + /// + /// The referenced data is produced as part of PyOxidizer packaging. This + /// likely comes from an include_bytes!(...) of a file generated by PyOxidizer. + pub py_resources_data: &'static [u8], + + /// Extra extension modules to make available to the interpreter. + /// + /// The values will effectively be passed to ``PyImport_ExtendInitTab()``. + pub extra_extension_modules: Vec, + + /// Whether to set sys.argvb with bytes versions of process arguments. + /// + /// On Windows, bytes will be UTF-16. On POSIX, bytes will be raw char* + /// values passed to `int main()`. + pub argvb: bool, + + /// Whether to set sys.frozen=True. + /// + /// Setting this will enable Python to emulate "frozen" binaries, such as + /// those used by PyInstaller. 
+ pub sys_frozen: bool, + + /// Whether to set sys._MEIPASS to the directory of the executable. + /// + /// Setting this will enable Python to emulate PyInstaller's behavior + /// of setting this attribute. + pub sys_meipass: bool, + + /// Which memory allocator to use for the raw domain. + pub raw_allocator: PythonRawAllocator, + + /// How to resolve the `terminfo` database. + pub terminfo_resolution: TerminfoResolution, + + /// Environment variable holding the directory to write a loaded modules file. + /// + /// If this value is set and the environment variable it refers to is set, + /// on interpreter shutdown, we will write a ``modules-`` file to + /// the directory specified containing a ``\n`` delimited list of modules + /// loaded in ``sys.modules``. + pub write_modules_directory_env: Option, + + /// Defines what code to run by default. + /// + pub run: PythonRunMode, +} diff --git a/rust/pyembed/src/data.rs b/rust/pyembed/src/data.rs new file mode 100644 --- /dev/null +++ b/rust/pyembed/src/data.rs @@ -0,0 +1,5 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +include!(env!("PYEMBED_DATA_RS_PATH")); diff --git a/rust/pyembed/src/importer.rs b/rust/pyembed/src/importer.rs new file mode 100644 --- /dev/null +++ b/rust/pyembed/src/importer.rs @@ -0,0 +1,911 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +/*! +Functionality for a Python importer. + +This module defines a Python meta path importer and associated functionality +for importing Python modules from memory. 
+*/ + +use std::cell::RefCell; +use std::collections::{HashMap, HashSet}; +use std::ffi::CStr; +use std::io::Cursor; +use std::sync::Arc; + +use byteorder::{LittleEndian, ReadBytesExt}; +use cpython::exc::{FileNotFoundError, ImportError, RuntimeError, ValueError}; +use cpython::{ + py_class, py_class_impl, py_coerce_item, py_fn, NoArgs, ObjectProtocol, PyClone, PyDict, PyErr, + PyList, PyModule, PyObject, PyResult, PyString, PyTuple, Python, PythonObject, ToPyObject, +}; +use python3_sys as pyffi; +use python3_sys::{PyBUF_READ, PyMemoryView_FromMemory}; + +use super::pyinterp::PYOXIDIZER_IMPORTER_NAME; + +/// Obtain a Python memoryview referencing a memory slice. +/// +/// New memoryview allows Python to access the underlying memory without +/// copying it. +#[inline] +fn get_memory_view(py: Python, data: &'static [u8]) -> Option { + let ptr = unsafe { PyMemoryView_FromMemory(data.as_ptr() as _, data.len() as _, PyBUF_READ) }; + unsafe { PyObject::from_owned_ptr_opt(py, ptr) } +} + +/// Holds pointers to Python module data in memory. +#[derive(Debug)] +struct PythonModuleData { + source: Option<&'static [u8]>, + bytecode: Option<&'static [u8]>, +} + +impl PythonModuleData { + /// Obtain a PyMemoryView instance for source data. + fn get_source_memory_view(&self, py: Python) -> Option { + match self.source { + Some(data) => get_memory_view(py, data), + None => None, + } + } + + /// Obtain a PyMemoryView instance for bytecode data. + fn get_bytecode_memory_view(&self, py: Python) -> Option { + match self.bytecode { + Some(data) => get_memory_view(py, data), + None => None, + } + } +} + +/// Represents Python modules data in memory. +/// +/// This is essentially an index over a raw backing blob. +struct PythonModulesData { + /// Packages in this set of modules. + packages: HashSet<&'static str>, + + /// Maps module name to source/bytecode. + data: HashMap<&'static str, PythonModuleData>, +} + +impl PythonModulesData { + /// Construct a new instance from a memory slice. 
+ fn from(data: &'static [u8]) -> Result { + let mut reader = Cursor::new(data); + + let count = reader + .read_u32::() + .or_else(|_| Err("failed reading count"))?; + + let mut index = Vec::with_capacity(count as usize); + let mut total_names_length = 0; + let mut total_sources_length = 0; + let mut package_count = 0; + + for _ in 0..count { + let name_length = reader + .read_u32::() + .or_else(|_| Err("failed reading name length"))? + as usize; + let source_length = reader + .read_u32::() + .or_else(|_| Err("failed reading source length"))? + as usize; + let bytecode_length = reader + .read_u32::() + .or_else(|_| Err("failed reading bytecode length"))? + as usize; + let flags = reader + .read_u32::() + .or_else(|_| Err("failed reading module flags"))?; + + let is_package = flags & 0x01 != 0; + + if is_package { + package_count += 1; + } + + index.push((name_length, source_length, bytecode_length, is_package)); + total_names_length += name_length; + total_sources_length += source_length; + } + + let mut res = HashMap::with_capacity(count as usize); + let mut packages = HashSet::with_capacity(package_count); + let sources_start_offset = reader.position() as usize + total_names_length; + let bytecodes_start_offset = sources_start_offset + total_sources_length; + + let mut sources_current_offset: usize = 0; + let mut bytecodes_current_offset: usize = 0; + + for (name_length, source_length, bytecode_length, is_package) in index { + let offset = reader.position() as usize; + + let name = + unsafe { std::str::from_utf8_unchecked(&data[offset..offset + name_length]) }; + + let source_offset = sources_start_offset + sources_current_offset; + let source = if source_length > 0 { + Some(&data[source_offset..source_offset + source_length]) + } else { + None + }; + + let bytecode_offset = bytecodes_start_offset + bytecodes_current_offset; + let bytecode = if bytecode_length > 0 { + Some(&data[bytecode_offset..bytecode_offset + bytecode_length]) + } else { + None + }; + + 
reader.set_position(offset as u64 + name_length as u64); + + sources_current_offset += source_length; + bytecodes_current_offset += bytecode_length; + + if is_package { + packages.insert(name); + } + + // Extension modules will have their names present to populate the + // packages set. So only populate module data if we have data for it. + if source.is_some() || bytecode.is_some() { + res.insert(name, PythonModuleData { source, bytecode }); + } + } + + Ok(PythonModulesData { + packages, + data: res, + }) + } +} + +/// Represents Python resources data in memory. +/// +/// This is essentially an index over a raw backing blob. +struct PythonResourcesData { + packages: HashMap<&'static str, Arc>>>, +} + +impl PythonResourcesData { + fn from(data: &'static [u8]) -> Result { + let mut reader = Cursor::new(data); + + let package_count = reader + .read_u32::() + .or_else(|_| Err("failed reading package count"))? as usize; + + let mut index = Vec::with_capacity(package_count); + let mut total_names_length = 0; + + for _ in 0..package_count { + let package_name_length = reader + .read_u32::() + .or_else(|_| Err("failed reading package name length"))? + as usize; + let resource_count = reader + .read_u32::() + .or_else(|_| Err("failed reading resource count"))? + as usize; + + total_names_length += package_name_length; + + let mut package_index = Vec::with_capacity(resource_count); + + for _ in 0..resource_count { + let resource_name_length = reader + .read_u32::() + .or_else(|_| Err("failed reading resource name length"))? + as usize; + let resource_data_length = reader + .read_u32::() + .or_else(|_| Err("failed reading resource data length"))? 
+ as usize; + + total_names_length += resource_name_length; + + package_index.push((resource_name_length, resource_data_length)); + } + + index.push((package_name_length, package_index)); + } + + let mut name_offset = reader.position() as usize; + let mut data_offset = name_offset + total_names_length; + let mut res = HashMap::new(); + + for (package_name_length, package_index) in index { + let package_name = unsafe { + std::str::from_utf8_unchecked(&data[name_offset..name_offset + package_name_length]) + }; + + name_offset += package_name_length; + + let mut package_data = Box::new(HashMap::new()); + + for (resource_name_length, resource_data_length) in package_index { + let resource_name = unsafe { + std::str::from_utf8_unchecked( + &data[name_offset..name_offset + resource_name_length], + ) + }; + + name_offset += resource_name_length; + + let resource_data = &data[data_offset..data_offset + resource_data_length]; + + data_offset += resource_data_length; + + package_data.insert(resource_name, resource_data); + } + + res.insert(package_name, Arc::new(package_data)); + } + + Ok(PythonResourcesData { packages: res }) + } +} + +#[allow(unused_doc_comments)] +/// Python type to import modules. +/// +/// This type implements the importlib.abc.MetaPathFinder interface for +/// finding/loading modules. It supports loading various flavors of modules, +/// allowing it to be the only registered sys.meta_path importer. +py_class!(class PyOxidizerFinder |py| { + data imp_module: PyModule; + data marshal_loads: PyObject; + data builtin_importer: PyObject; + data frozen_importer: PyObject; + data call_with_frames_removed: PyObject; + data module_spec_type: PyObject; + data decode_source: PyObject; + data exec_fn: PyObject; + data packages: HashSet<&'static str>; + data known_modules: KnownModules; + data resources: HashMap<&'static str, Arc>>>; + data resource_readers: RefCell>>; + + // Start of importlib.abc.MetaPathFinder interface. 
+ + def find_spec(&self, fullname: &PyString, path: &PyObject, target: Option = None) -> PyResult { + let key = fullname.to_string(py)?; + + if let Some(flavor) = self.known_modules(py).get(&*key) { + match flavor { + KnownModuleFlavor::Builtin => { + // BuiltinImporter.find_spec() always returns None if `path` is defined. + // And it doesn't use `target`. So don't proxy these values. + self.builtin_importer(py).call_method(py, "find_spec", (fullname,), None) + } + KnownModuleFlavor::Frozen => { + self.frozen_importer(py).call_method(py, "find_spec", (fullname, path, target), None) + } + KnownModuleFlavor::InMemory { .. } => { + let is_package = self.packages(py).contains(&*key); + + // TODO consider setting origin and has_location so __file__ will be + // populated. + + let kwargs = PyDict::new(py); + kwargs.set_item(py, "is_package", is_package)?; + + self.module_spec_type(py).call(py, (fullname, self), Some(&kwargs)) + } + } + } else { + Ok(py.None()) + } + } + + def find_module(&self, _fullname: &PyObject, _path: &PyObject) -> PyResult { + // Method is deprecated. Always returns None. + // We /could/ call find_spec(). Meh. + Ok(py.None()) + } + + def invalidate_caches(&self) -> PyResult { + Ok(py.None()) + } + + // End of importlib.abc.MetaPathFinder interface. + + // Start of importlib.abc.Loader interface. 
+ + def create_module(&self, _spec: &PyObject) -> PyResult { + Ok(py.None()) + } + + def exec_module(&self, module: &PyObject) -> PyResult { + let name = module.getattr(py, "__name__")?; + let key = name.extract::(py)?; + + if let Some(flavor) = self.known_modules(py).get(&*key) { + match flavor { + KnownModuleFlavor::Builtin => { + self.builtin_importer(py).call_method(py, "exec_module", (module,), None) + }, + KnownModuleFlavor::Frozen => { + self.frozen_importer(py).call_method(py, "exec_module", (module,), None) + }, + KnownModuleFlavor::InMemory { module_data } => { + match module_data.get_bytecode_memory_view(py) { + Some(value) => { + let code = self.marshal_loads(py).call(py, (value,), None)?; + let exec_fn = self.exec_fn(py); + let dict = module.getattr(py, "__dict__")?; + + self.call_with_frames_removed(py).call(py, (exec_fn, code, dict), None) + }, + None => { + Err(PyErr::new::(py, ("cannot find code in memory", name))) + } + } + }, + } + } else { + // Raising here might make more sense, as exec_module() shouldn't + // be called on the Loader that didn't create the module. + Ok(py.None()) + } + } + + // End of importlib.abc.Loader interface. + + // Start of importlib.abc.InspectLoader interface. 
+ + def get_code(&self, fullname: &PyString) -> PyResult { + let key = fullname.to_string(py)?; + + if let Some(flavor) = self.known_modules(py).get(&*key) { + match flavor { + KnownModuleFlavor::Frozen => { + let imp_module = self.imp_module(py); + + imp_module.call(py, "get_frozen_object", (fullname,), None) + }, + KnownModuleFlavor::InMemory { module_data } => { + match module_data.get_bytecode_memory_view(py) { + Some(value) => { + self.marshal_loads(py).call(py, (value,), None) + } + None => { + Err(PyErr::new::(py, ("cannot find code in memory", fullname))) + } + } + }, + KnownModuleFlavor::Builtin => { + Ok(py.None()) + } + } + } else { + Ok(py.None()) + } + } + + def get_source(&self, fullname: &PyString) -> PyResult { + let key = fullname.to_string(py)?; + + if let Some(flavor) = self.known_modules(py).get(&*key) { + if let KnownModuleFlavor::InMemory { module_data } = flavor { + match module_data.get_source_memory_view(py) { + Some(value) => { + // decode_source (from importlib._bootstrap_external) + // can't handle memoryview. So we take the memory hit and + // cast to bytes. + let b = value.call_method(py, "tobytes", NoArgs, None)?; + self.decode_source(py).call(py, (b,), None) + }, + None => { + Err(PyErr::new::(py, ("source not available", fullname))) + } + } + } else { + Ok(py.None()) + } + } else { + Ok(py.None()) + } + } + + // End of importlib.abc.InspectLoader interface. + + // Support obtaining ResourceReader instances. + def get_resource_reader(&self, fullname: &PyString) -> PyResult { + let key = fullname.to_string(py)?; + + // A failed borrow should not happen since the code below should not recurse into this + // function. + let mut resource_readers = match self.resource_readers(py).try_borrow_mut() { + Ok(v) => v, + Err(_) => { + return Err(PyErr::new::(py, "resource reader already borrowed")); + } + }; + + // Return an existing instance if we have one. 
+ if let Some(reader) = resource_readers.get(&*key) { + return Ok(reader.clone_ref(py)); + } + + // Only create a reader if the name is a package. + if self.packages(py).contains(&*key) { + + // Not all packages have known resources. + let resources = match self.resources(py).get(&*key) { + Some(v) => v.clone(), + None => { + let h: Box> = Box::new(HashMap::new()); + Arc::new(h) + } + }; + + let reader = PyOxidizerResourceReader::create_instance(py, resources)?.into_object(); + resource_readers.insert(key.to_string(), reader.clone_ref(py)); + + Ok(reader) + } else { + Ok(py.None()) + } + } +}); + +#[allow(unused_doc_comments)] +/// Implements in-memory reading of resource data. +/// +/// Implements importlib.abc.ResourceReader. +py_class!(class PyOxidizerResourceReader |py| { + data resources: Arc>>; + + /// Returns an opened, file-like object for binary reading of the resource. + /// + /// If the resource cannot be found, FileNotFoundError is raised. + def open_resource(&self, resource: &PyString) -> PyResult { + let key = resource.to_string(py)?; + + if let Some(data) = self.resources(py).get(&*key) { + match get_memory_view(py, data) { + Some(mv) => { + let io_module = py.import("io")?; + let bytes_io = io_module.get(py, "BytesIO")?; + + bytes_io.call(py, (mv,), None) + } + None => Err(PyErr::fetch(py)) + } + } else { + Err(PyErr::new::(py, "resource not found")) + } + } + + /// Returns the file system path to the resource. + /// + /// If the resource does not concretely exist on the file system, raise + /// FileNotFoundError. + def resource_path(&self, _resource: &PyString) -> PyResult { + Err(PyErr::new::(py, "in-memory resources do not have filesystem paths")) + } + + /// Returns True if the named name is considered a resource. FileNotFoundError + /// is raised if name does not exist. 
+ def is_resource(&self, name: &PyString) -> PyResult { + let key = name.to_string(py)?; + + if self.resources(py).contains_key(&*key) { + Ok(py.True().as_object().clone_ref(py)) + } else { + Err(PyErr::new::(py, "resource not found")) + } + } + + /// Returns an iterable of strings over the contents of the package. + /// + /// Do note that it is not required that all names returned by the iterator be actual resources, + /// e.g. it is acceptable to return names for which is_resource() would be false. + /// + /// Allowing non-resource names to be returned is to allow for situations where how a package + /// and its resources are stored are known a priori and the non-resource names would be useful. + /// For instance, returning subdirectory names is allowed so that when it is known that the + /// package and resources are stored on the file system then those subdirectory names can be + /// used directly. + def contents(&self) -> PyResult { + let resources = self.resources(py); + let mut names = Vec::with_capacity(resources.len()); + + for name in resources.keys() { + names.push(name.to_py_object(py)); + } + + let names_list = names.to_py_object(py); + + Ok(names_list.as_object().clone_ref(py)) + } +}); + +const DOC: &[u8] = b"Binary representation of Python modules\0"; + +/// Represents global module state to be passed at interpreter initialization time. +#[derive(Debug)] +pub struct InitModuleState { + /// Whether to register the filesystem importer on sys.meta_path. + pub register_filesystem_importer: bool, + + /// Values to set on sys.path. + pub sys_paths: Vec, + + /// Raw data constituting Python module source code. + pub py_modules_data: &'static [u8], + + /// Raw data constituting Python resources data. + pub py_resources_data: &'static [u8], +} + +/// Holds reference to next module state struct. +/// +/// This module state will be copied into the module's state when the +/// Python module is initialized. 
+pub static mut NEXT_MODULE_STATE: *const InitModuleState = std::ptr::null(); + +/// Represents which importer to use for known modules. +#[derive(Debug)] +enum KnownModuleFlavor { + Builtin, + Frozen, + InMemory { module_data: PythonModuleData }, +} + +type KnownModules = HashMap<&'static str, KnownModuleFlavor>; + +/// State associated with each importer module instance. +/// +/// We write per-module state to per-module instances of this struct so +/// we don't rely on global variables and so multiple importer modules can +/// exist without issue. +#[derive(Debug)] +struct ModuleState { + /// Whether to register PathFinder on sys.meta_path. + register_filesystem_importer: bool, + + /// Values to set on sys.path. + sys_paths: Vec, + + /// Raw data constituting Python module source code. + py_modules_data: &'static [u8], + + /// Raw data constituting Python resources data. + py_resources_data: &'static [u8], + + /// Whether setup() has been called. + setup_called: bool, +} + +/// Obtain the module state for an instance of our importer module. +/// +/// Creates a Python exception on failure. +/// +/// Doesn't do type checking that the PyModule is of the appropriate type. +fn get_module_state<'a>(py: Python, m: &'a PyModule) -> Result<&'a mut ModuleState, PyErr> { + let ptr = m.as_object().as_ptr(); + let state = unsafe { pyffi::PyModule_GetState(ptr) as *mut ModuleState }; + + if state.is_null() { + let err = PyErr::new::(py, "unable to retrieve module state"); + return Err(err); + } + + Ok(unsafe { &mut *state }) +} + +/// Initialize the Python module object. +/// +/// This is called as part of the PyInit_* function to create the internal +/// module object for the interpreter. +/// +/// This receives a handle to the current Python interpreter and just-created +/// Python module instance. It populates the internal module state and registers +/// a _setup() on the module object for usage by Python. 
+/// +/// Because this function accesses NEXT_MODULE_STATE, it should only be +/// called during interpreter initialization. +fn module_init(py: Python, m: &PyModule) -> PyResult<()> { + let mut state = get_module_state(py, m)?; + + unsafe { + state.register_filesystem_importer = (*NEXT_MODULE_STATE).register_filesystem_importer; + // TODO we could move the value if we wanted to avoid the clone(). + state.sys_paths = (*NEXT_MODULE_STATE).sys_paths.clone(); + state.py_modules_data = (*NEXT_MODULE_STATE).py_modules_data; + state.py_resources_data = (*NEXT_MODULE_STATE).py_resources_data; + } + + state.setup_called = false; + + m.add( + py, + "_setup", + py_fn!( + py, + module_setup( + m: PyModule, + bootstrap_module: PyModule, + marshal_module: PyModule, + decode_source: PyObject + ) + ), + )?; + + Ok(()) +} + +/// Called after module import/initialization to configure the importing mechanism. +/// +/// This does the heavy work of configuring the importing mechanism. +/// +/// This function should only be called once as part of +/// _frozen_importlib_external._install_external_importers(). +fn module_setup( + py: Python, + m: PyModule, + bootstrap_module: PyModule, + marshal_module: PyModule, + decode_source: PyObject, +) -> PyResult { + let state = get_module_state(py, &m)?; + + if state.setup_called { + return Err(PyErr::new::( + py, + "PyOxidizer _setup() already called", + )); + } + + state.setup_called = true; + + let imp_module = bootstrap_module.get(py, "_imp")?; + let imp_module = imp_module.cast_into::(py)?; + let sys_module = bootstrap_module.get(py, "sys")?; + let sys_module = sys_module.cast_as::(py)?; + let meta_path_object = sys_module.get(py, "meta_path")?; + + // We should be executing as part of + // _frozen_importlib_external._install_external_importers(). + // _frozen_importlib._install() should have already been called and set up + // sys.meta_path with [BuiltinImporter, FrozenImporter]. Those should be the + // only meta path importers present. 
+ + let meta_path = meta_path_object.cast_as::(py)?; + + if meta_path.len(py) != 2 { + return Err(PyErr::new::( + py, + "sys.meta_path does not contain 2 values", + )); + } + + let builtin_importer = meta_path.get_item(py, 0); + let frozen_importer = meta_path.get_item(py, 1); + + // It may seem inefficient to create a full HashMap of the parsed data instead of e.g. + // streaming it. But the overhead of iterators was measured to be more than building + // up a temporary HashMap. + let modules_data = match PythonModulesData::from(state.py_modules_data) { + Ok(v) => v, + Err(msg) => return Err(PyErr::new::(py, msg)), + }; + + // Populate our known module lookup table with entries from builtins, frozens, and + // finally us. Last write wins and has the same effect as registering our + // meta path importer first. This should be safe. If nothing else, it allows + // some builtins to be overwritten by .py implemented modules. + let mut known_modules = KnownModules::with_capacity(modules_data.data.len() + 10); + + for i in 0.. { + let record = unsafe { pyffi::PyImport_Inittab.offset(i) }; + + if unsafe { *record }.name.is_null() { + break; + } + + let name = unsafe { CStr::from_ptr((*record).name as _) }; + let name_str = match name.to_str() { + Ok(v) => v, + Err(_) => { + return Err(PyErr::new::( + py, + "unable to parse PyImport_Inittab", + )); + } + }; + + known_modules.insert(name_str, KnownModuleFlavor::Builtin); + } + + for i in 0.. 
{ + let record = unsafe { pyffi::PyImport_FrozenModules.offset(i) }; + + if unsafe { *record }.name.is_null() { + break; + } + + let name = unsafe { CStr::from_ptr((*record).name as _) }; + let name_str = match name.to_str() { + Ok(v) => v, + Err(_) => { + return Err(PyErr::new::( + py, + "unable to parse PyImport_FrozenModules", + )); + } + }; + + known_modules.insert(name_str, KnownModuleFlavor::Frozen); + } + + for (name, record) in modules_data.data { + known_modules.insert( + name, + KnownModuleFlavor::InMemory { + module_data: record, + }, + ); + } + + let resources_data = match PythonResourcesData::from(state.py_resources_data) { + Ok(v) => v, + Err(msg) => return Err(PyErr::new::(py, msg)), + }; + + let marshal_loads = marshal_module.get(py, "loads")?; + let call_with_frames_removed = bootstrap_module.get(py, "_call_with_frames_removed")?; + let module_spec_type = bootstrap_module.get(py, "ModuleSpec")?; + + let builtins_module = + match unsafe { PyObject::from_borrowed_ptr_opt(py, pyffi::PyEval_GetBuiltins()) } { + Some(o) => o.cast_into::(py), + None => { + return Err(PyErr::new::( + py, + "unable to obtain __builtins__", + )); + } + }?; + + let exec_fn = match builtins_module.get_item(py, "exec") { + Some(v) => v, + None => { + return Err(PyErr::new::( + py, + "could not obtain __builtins__.exec", + )); + } + }; + + let resource_readers: RefCell>> = + RefCell::new(Box::new(HashMap::new())); + + let unified_importer = PyOxidizerFinder::create_instance( + py, + imp_module, + marshal_loads, + builtin_importer, + frozen_importer, + call_with_frames_removed, + module_spec_type, + decode_source, + exec_fn, + modules_data.packages, + known_modules, + resources_data.packages, + resource_readers, + )?; + meta_path_object.call_method(py, "clear", NoArgs, None)?; + meta_path_object.call_method(py, "append", (unified_importer,), None)?; + + // At this point the importing mechanism is fully initialized to use our + // unified importer, which handles built-in, frozen, 
and in-memory imports. + + // Because we're probably running during Py_Initialize() and stdlib modules + // may not be in-memory, we need to register and configure additional importers + // here, before continuing with Py_Initialize(), otherwise we may not find + // the standard library! + + if state.register_filesystem_importer { + // This is what importlib._bootstrap_external usually does: + // supported_loaders = _get_supported_file_loaders() + // sys.path_hooks.extend([FileFinder.path_hook(*supported_loaders)]) + // sys.meta_path.append(PathFinder) + let frozen_importlib_external = py.import("_frozen_importlib_external")?; + + let loaders = + frozen_importlib_external.call(py, "_get_supported_file_loaders", NoArgs, None)?; + let loaders_list = loaders.cast_as::(py)?; + let loaders_vec: Vec = loaders_list.iter(py).collect(); + let loaders_tuple = PyTuple::new(py, loaders_vec.as_slice()); + + let file_finder = frozen_importlib_external.get(py, "FileFinder")?; + let path_hook = file_finder.call_method(py, "path_hook", loaders_tuple, None)?; + let path_hooks = sys_module.get(py, "path_hooks")?; + path_hooks.call_method(py, "append", (path_hook,), None)?; + + let path_finder = frozen_importlib_external.get(py, "PathFinder")?; + let meta_path = sys_module.get(py, "meta_path")?; + meta_path.call_method(py, "append", (path_finder,), None)?; + } + + // Ideally we should be calling Py_SetPath() before Py_Initialize() to set sys.path. + // But we tried to do this and only ran into problems due to string conversions, + // unwanted side-effects. Updating sys.path directly before it is used by PathFinder + // (which was just registered above) should have the same effect. + + // Always clear out sys.path. + let sys_path = sys_module.get(py, "path")?; + sys_path.call_method(py, "clear", NoArgs, None)?; + + // And repopulate it with entries from the config. 
+ for path in &state.sys_paths { + let py_path = PyString::new(py, path.as_str()); + + sys_path.call_method(py, "append", (py_path,), None)?; + } + + Ok(py.None()) +} + +static mut MODULE_DEF: pyffi::PyModuleDef = pyffi::PyModuleDef { + m_base: pyffi::PyModuleDef_HEAD_INIT, + m_name: std::ptr::null(), + m_doc: std::ptr::null(), + m_size: std::mem::size_of::() as isize, + m_methods: 0 as *mut _, + m_slots: 0 as *mut _, + m_traverse: None, + m_clear: None, + m_free: None, +}; + +/// Module initialization function. +/// +/// This creates the Python module object. +/// +/// We don't use the macros in the cpython crate because they are somewhat +/// opinionated about how things should work. e.g. they call +/// PyEval_InitThreads(), which is undesired. We want total control. +#[allow(non_snake_case)] +pub extern "C" fn PyInit__pyoxidizer_importer() -> *mut pyffi::PyObject { + let py = unsafe { cpython::Python::assume_gil_acquired() }; + + // TRACKING RUST1.32 We can't call as_ptr() in const fn in Rust 1.31. + unsafe { + if MODULE_DEF.m_name.is_null() { + MODULE_DEF.m_name = PYOXIDIZER_IMPORTER_NAME.as_ptr() as *const _; + MODULE_DEF.m_doc = DOC.as_ptr() as *const _; + } + } + + let module = unsafe { pyffi::PyModule_Create(&mut MODULE_DEF) }; + + if module.is_null() { + return module; + } + + let module = match unsafe { PyObject::from_owned_ptr(py, module).cast_into::(py) } { + Ok(m) => m, + Err(e) => { + PyErr::from(e).restore(py); + return std::ptr::null_mut(); + } + }; + + match module_init(py, &module) { + Ok(()) => module.into_object().steal_ptr(), + Err(e) => { + e.restore(py); + std::ptr::null_mut() + } + } +} diff --git a/rust/pyembed/src/lib.rs b/rust/pyembed/src/lib.rs new file mode 100644 --- /dev/null +++ b/rust/pyembed/src/lib.rs @@ -0,0 +1,35 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +/*! 
+Manage an embedded Python interpreter. + +The `pyembed` crate contains functionality for managing a Python interpreter +embedded in the current binary. This crate is typically used along with +[PyOxidizer](https://github.com/indygreg/PyOxidizer) for producing +self-contained binaries containing Python. + +The most important types are [`PythonConfig`](struct.PythonConfig.html) and +[`MainPythonInterpreter`](struct.MainPythonInterpreter.html). A `PythonConfig` +defines how a Python interpreter is to behave. A `MainPythonInterpreter` +creates and manages that interpreter and serves as a high-level interface for +running code in the interpreter. +*/ + +mod config; +mod data; +mod importer; +mod osutils; +mod pyalloc; +mod pyinterp; +mod pystr; + +#[allow(unused_imports)] +pub use crate::config::PythonConfig; + +#[allow(unused_imports)] +pub use crate::data::default_python_config; + +#[allow(unused_imports)] +pub use crate::pyinterp::MainPythonInterpreter; diff --git a/rust/pyembed/src/osutils.rs b/rust/pyembed/src/osutils.rs new file mode 100644 --- /dev/null +++ b/rust/pyembed/src/osutils.rs @@ -0,0 +1,147 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use lazy_static::lazy_static; +use std::path::{Path, PathBuf}; + +/// terminfo directories for Debian based distributions. +/// +/// Search for `--with-terminfo-dirs` at +/// https://salsa.debian.org/debian/ncurses/blob/master/debian/rules to find +/// the source of truth for this. +const TERMINFO_DIRS_DEBIAN: &str = "/etc/terminfo:/lib/terminfo:/usr/share/terminfo"; + +/// terminfo directories for RedHat based distributions. +/// +/// CentOS compiled with +/// `--with-terminfo-dirs=%{_sysconfdir}/terminfo:%{_datadir}/terminfo`. +const TERMINFO_DIRS_REDHAT: &str = "/etc/terminfo:/usr/share/terminfo"; + +/// terminfo directories for macOS. 
+const TERMINFO_DIRS_MACOS: &str = "/usr/share/terminfo"; + +lazy_static! { + static ref TERMINFO_DIRS_COMMON: Vec = { + vec![ + PathBuf::from("/usr/local/etc/terminfo"), + PathBuf::from("/usr/local/lib/terminfo"), + PathBuf::from("/usr/local/share/terminfo"), + PathBuf::from("/etc/terminfo"), + PathBuf::from("/usr/lib/terminfo"), + PathBuf::from("/lib/terminfo"), + PathBuf::from("/usr/share/terminfo"), + ] + }; +} + +#[derive(Clone)] +enum OsVariant { + Linux, + MacOs, + Windows, + Other, +} + +enum LinuxDistroVariant { + Debian, + RedHat, + Unknown, +} + +lazy_static! { + static ref TARGET_OS: OsVariant = { + if cfg!(target_os = "linux") { + OsVariant::Linux + } else if cfg!(target_os = "macos") { + OsVariant::MacOs + } else if cfg!(target_os = "windows") { + OsVariant::Windows + } else { + OsVariant::Other + } + }; +} + +struct OsInfo { + os: OsVariant, + linux_distro: Option, +} + +fn resolve_linux_distro() -> LinuxDistroVariant { + // Attempt to resolve the Linux distro by parsing /etc files. + let os_release = Path::new("/etc/os-release"); + + if let Ok(data) = std::fs::read_to_string(os_release) { + for line in data.split("\n") { + if line.starts_with("ID_LIKE=") { + if line.contains("debian") { + return LinuxDistroVariant::Debian; + } else if line.contains("rhel") || line.contains("fedora") { + return LinuxDistroVariant::RedHat; + } + } else if line.starts_with("ID=") { + if line.contains("fedora") { + return LinuxDistroVariant::RedHat; + } + } + } + } + + LinuxDistroVariant::Unknown +} + +fn resolve_os_info() -> OsInfo { + let os = TARGET_OS.clone(); + let linux_distro = match os { + OsVariant::Linux => Some(resolve_linux_distro()), + _ => None, + }; + + OsInfo { os, linux_distro } +} + +/// Attempt to resolve the value for the `TERMINFO_DIRS` environment variable. +/// +/// Returns Some() value that `TERMINFO_DIRS` should be set to or None if +/// no environment variable should be set. 
+pub fn resolve_terminfo_dirs() -> Option { + // Always respect an environment variable, if present. + if std::env::var("TERMINFO_DIRS").is_ok() { + return None; + } + + let os_info = resolve_os_info(); + + match os_info.os { + OsVariant::Linux => match os_info.linux_distro.unwrap() { + // TODO we could stat() the well-known paths ourselves and omit + // paths that don't exist. This /might/ save some syscalls, since + // ncurses doesn't appear to be the most frugal w.r.t. filesystem + // requests. + LinuxDistroVariant::Debian => Some(TERMINFO_DIRS_DEBIAN.to_string()), + LinuxDistroVariant::RedHat => Some(TERMINFO_DIRS_REDHAT.to_string()), + LinuxDistroVariant::Unknown => { + // We don't know this Linux variant. Look for common terminfo + // database directories and use paths that are found. + let paths = TERMINFO_DIRS_COMMON + .iter() + .filter_map(|p| { + if p.exists() { + Some(p.display().to_string()) + } else { + None + } + }) + .collect::>() + .join(":"); + + Some(paths) + } + }, + OsVariant::MacOs => Some(TERMINFO_DIRS_MACOS.to_string()), + // Windows doesn't use the terminfo database. + OsVariant::Windows => None, + OsVariant::Other => None, + } +} diff --git a/rust/pyembed/src/pyalloc.rs b/rust/pyembed/src/pyalloc.rs new file mode 100644 --- /dev/null +++ b/rust/pyembed/src/pyalloc.rs @@ -0,0 +1,221 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Custom Python memory allocators. + +#[cfg(feature = "jemalloc-sys")] +use jemalloc_sys as jemallocffi; +use libc::{c_void, size_t}; +use python3_sys as pyffi; +use std::alloc; +use std::collections::HashMap; +#[cfg(feature = "jemalloc-sys")] +use std::ptr::null_mut; + +const MIN_ALIGN: usize = 16; + +type RawAllocatorState = HashMap<*mut u8, alloc::Layout>; + +/// Holds state for the raw memory allocator. 
+/// +/// Ideally we wouldn't need to track state. But Rust's dealloc() API +/// requires passing in a Layout that matches the allocation. This means +/// we need to track the Layout for each allocation. This data structure +/// facilitates that. +/// +/// TODO HashMap isn't thread safe and the Python raw allocator doesn't +/// hold the GIL. So we need a thread safe map or a mutex guarding access. +pub struct RawAllocator { + pub allocator: pyffi::PyMemAllocatorEx, + _state: Box, +} + +extern "C" fn raw_rust_malloc(ctx: *mut c_void, size: size_t) -> *mut c_void { + // PyMem_RawMalloc()'s docs say: Requesting zero bytes returns a distinct + // non-NULL pointer if possible, as if PyMem_RawMalloc(1) had been called + // instead. + let size = match size { + 0 => 1, + val => val, + }; + + unsafe { + let state = ctx as *mut RawAllocatorState; + let layout = alloc::Layout::from_size_align_unchecked(size, MIN_ALIGN); + let res = alloc::alloc(layout); + + (*state).insert(res, layout); + + //println!("allocated {} bytes to {:?}", size, res); + res as *mut c_void + } +} + +extern "C" fn raw_rust_calloc(ctx: *mut c_void, nelem: size_t, elsize: size_t) -> *mut c_void { + // PyMem_RawCalloc()'s docs say: Requesting zero elements or elements of + // size zero bytes returns a distinct non-NULL pointer if possible, as if + // PyMem_RawCalloc(1, 1) had been called instead. 
+ let size = match nelem * elsize { + 0 => 1, + val => val, + }; + + unsafe { + let state = ctx as *mut RawAllocatorState; + let layout = alloc::Layout::from_size_align_unchecked(size, MIN_ALIGN); + let res = alloc::alloc_zeroed(layout); + + (*state).insert(res, layout); + + //println!("zero allocated {} bytes to {:?}", size, res); + + res as *mut c_void + } +} + +extern "C" fn raw_rust_realloc( + ctx: *mut c_void, + ptr: *mut c_void, + new_size: size_t, +) -> *mut c_void { + //println!("reallocating {:?} to {} bytes", ptr as *mut u8, new_size); + + // PyMem_RawRealloc()'s docs say: If p is NULL, the call is equivalent to + // PyMem_RawMalloc(n); else if n is equal to zero, the memory block is + // resized but is not freed, and the returned pointer is non-NULL. + if ptr.is_null() { + return raw_rust_malloc(ctx, new_size); + } + + let new_size = match new_size { + 0 => 1, + val => val, + }; + + unsafe { + let state = ctx as *mut RawAllocatorState; + let layout = alloc::Layout::from_size_align_unchecked(new_size, MIN_ALIGN); + + let key = ptr as *mut u8; + let old_layout = (*state) + .remove(&key) + .expect("original memory address not tracked"); + + let res = alloc::realloc(ptr as *mut u8, old_layout, new_size); + + (*state).insert(res, layout); + + res as *mut c_void + } +} + +extern "C" fn raw_rust_free(ctx: *mut c_void, ptr: *mut c_void) { + if ptr.is_null() { + return; + } + + //println!("freeing {:?}", ptr as *mut u8); + unsafe { + let state = ctx as *mut RawAllocatorState; + + let key = ptr as *mut u8; + let layout = (*state) + .get(&key) + .expect(format!("could not find allocated memory record: {:?}", key).as_str()); + + alloc::dealloc(key, *layout); + (*state).remove(&key); + } +} + +pub fn make_raw_rust_memory_allocator() -> RawAllocator { + // We need to allocate the HashMap on the heap so the pointer doesn't refer + // to the stack. We rebox and add the Box to our struct so lifetimes are + // managed. 
+ let alloc = Box::new(HashMap::<*mut u8, alloc::Layout>::new()); + let state = Box::into_raw(alloc); + + let allocator = pyffi::PyMemAllocatorEx { + ctx: state as *mut c_void, + malloc: Some(raw_rust_malloc), + calloc: Some(raw_rust_calloc), + realloc: Some(raw_rust_realloc), + free: Some(raw_rust_free), + }; + + RawAllocator { + allocator, + _state: unsafe { Box::from_raw(state) }, + } +} + +// Now let's define a raw memory allocator that interfaces directly with jemalloc. +// This avoids the overhead of going through Rust's allocation layer. + +#[cfg(feature = "jemalloc-sys")] +extern "C" fn raw_jemalloc_malloc(_ctx: *mut c_void, size: size_t) -> *mut c_void { + // PyMem_RawMalloc()'s docs say: Requesting zero bytes returns a distinct + // non-NULL pointer if possible, as if PyMem_RawMalloc(1) had been called + // instead. + let size = match size { + 0 => 1, + val => val, + }; + + unsafe { jemallocffi::mallocx(size, 0) } +} + +#[cfg(feature = "jemalloc-sys")] +extern "C" fn raw_jemalloc_calloc(_ctx: *mut c_void, nelem: size_t, elsize: size_t) -> *mut c_void { + // PyMem_RawCalloc()'s docs say: Requesting zero elements or elements of + // size zero bytes returns a distinct non-NULL pointer if possible, as if + // PyMem_RawCalloc(1, 1) had been called instead. + let size = match nelem * elsize { + 0 => 1, + val => val, + }; + + unsafe { jemallocffi::mallocx(size, jemallocffi::MALLOCX_ZERO) } +} + +#[cfg(feature = "jemalloc-sys")] +extern "C" fn raw_jemalloc_realloc( + ctx: *mut c_void, + ptr: *mut c_void, + new_size: size_t, +) -> *mut c_void { + // PyMem_RawRealloc()'s docs say: If p is NULL, the call is equivalent to + // PyMem_RawMalloc(n); else if n is equal to zero, the memory block is + // resized but is not freed, and the returned pointer is non-NULL. 
+ if ptr.is_null() { + return raw_jemalloc_malloc(ctx, new_size); + } + + let new_size = match new_size { + 0 => 1, + val => val, + }; + + unsafe { jemallocffi::rallocx(ptr, new_size, 0) } +} + +#[cfg(feature = "jemalloc-sys")] +extern "C" fn raw_jemalloc_free(_ctx: *mut c_void, ptr: *mut c_void) { + if ptr.is_null() { + return; + } + + unsafe { jemallocffi::dallocx(ptr, 0) } +} + +#[cfg(feature = "jemalloc-sys")] +pub fn make_raw_jemalloc_allocator() -> pyffi::PyMemAllocatorEx { + pyffi::PyMemAllocatorEx { + ctx: null_mut(), + malloc: Some(raw_jemalloc_malloc), + calloc: Some(raw_jemalloc_calloc), + realloc: Some(raw_jemalloc_realloc), + free: Some(raw_jemalloc_free), + } +} diff --git a/rust/pyembed/src/pyinterp.rs b/rust/pyembed/src/pyinterp.rs new file mode 100644 --- /dev/null +++ b/rust/pyembed/src/pyinterp.rs @@ -0,0 +1,824 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Manage an embedded Python interpreter. 
+ +use libc::c_char; +use python3_sys as pyffi; +use std::collections::BTreeSet; +use std::env; +use std::ffi::CString; +use std::fs; +use std::io::Write; +use std::path::PathBuf; +use std::ptr::null; + +use cpython::exc::ValueError; +use cpython::{ + GILGuard, NoArgs, ObjectProtocol, PyClone, PyDict, PyErr, PyList, PyModule, PyObject, PyResult, + PyString, Python, PythonObject, ToPyObject, +}; + +use super::config::{PythonConfig, PythonRawAllocator, PythonRunMode, TerminfoResolution}; +use super::importer::PyInit__pyoxidizer_importer; +use super::osutils::resolve_terminfo_dirs; +#[cfg(feature = "jemalloc-sys")] +use super::pyalloc::make_raw_jemalloc_allocator; +use super::pyalloc::{make_raw_rust_memory_allocator, RawAllocator}; +use super::pystr::{osstring_to_bytes, osstring_to_str, OwnedPyStr}; + +pub const PYOXIDIZER_IMPORTER_NAME: &[u8] = b"_pyoxidizer_importer\0"; + +const FROZEN_IMPORTLIB_NAME: &[u8] = b"_frozen_importlib\0"; +const FROZEN_IMPORTLIB_EXTERNAL_NAME: &[u8] = b"_frozen_importlib_external\0"; + +/// Represents the results of executing Python code with exception handling. +#[derive(Debug)] +pub enum PythonRunResult { + /// Code executed without raising an exception. + Ok {}, + /// Code executed and raised an exception. + Err {}, + /// Code executed and raised SystemExit with the specified exit code. 
+ Exit { code: i32 }, +} + +fn make_custom_frozen_modules(config: &PythonConfig) -> [pyffi::_frozen; 3] { + [ + pyffi::_frozen { + name: FROZEN_IMPORTLIB_NAME.as_ptr() as *const i8, + code: config.frozen_importlib_data.as_ptr(), + size: config.frozen_importlib_data.len() as i32, + }, + pyffi::_frozen { + name: FROZEN_IMPORTLIB_EXTERNAL_NAME.as_ptr() as *const i8, + code: config.frozen_importlib_external_data.as_ptr(), + size: config.frozen_importlib_external_data.len() as i32, + }, + pyffi::_frozen { + name: null(), + code: null(), + size: 0, + }, + ] +} + +#[cfg(windows)] +extern "C" { + pub fn __acrt_iob_func(x: u32) -> *mut libc::FILE; +} + +#[cfg(windows)] +fn stdin_to_file() -> *mut libc::FILE { + // The stdin symbol is made available by importing . On Windows, + // stdin is defined in corecrt_wstdio.h as a `#define` that calls this + // internal CRT function. There's no exported symbol to use. So we + // emulate the behavior of the C code. + // + // Relying on an internal CRT symbol is probably wrong. But Microsoft + // typically keeps backwards compatibility for undocumented functions + // like this because people use them in the wild. + // + // An attempt was made to use fdopen(0) like we do on POSIX. However, + // this causes a crash. The Microsoft C Runtime is already bending over + // backwards to coerce its native HANDLEs into POSIX file descriptors. + // Even if there are other ways to coerce a FILE* from a HANDLE + // (_open_osfhandle() + _fdopen() might work), using the same function + // that uses to obtain a FILE* seems like the least risky thing + // to do. 
+ unsafe { __acrt_iob_func(0) } +} + +#[cfg(unix)] +fn stdin_to_file() -> *mut libc::FILE { + unsafe { libc::fdopen(libc::STDIN_FILENO, &('r' as libc::c_char)) } +} + +#[cfg(windows)] +fn stderr_to_file() -> *mut libc::FILE { + unsafe { __acrt_iob_func(2) } +} + +#[cfg(unix)] +fn stderr_to_file() -> *mut libc::FILE { + unsafe { libc::fdopen(libc::STDERR_FILENO, &('w' as libc::c_char)) } +} + +#[cfg(feature = "jemalloc-sys")] +fn raw_jemallocator() -> pyffi::PyMemAllocatorEx { + make_raw_jemalloc_allocator() +} + +#[cfg(not(feature = "jemalloc-sys"))] +fn raw_jemallocator() -> pyffi::PyMemAllocatorEx { + panic!("jemalloc is not available in this build configuration"); +} + +/// Manages an embedded Python interpreter. +/// +/// **Warning: Python interpreters have global state. There should only be a +/// single instance of this type per process.** +/// +/// Instances must only be constructed through [`MainPythonInterpreter::new()`](#method.new). +/// +/// This type and its various functionality is a glorified wrapper around the +/// Python C API. But there's a lot of added functionality on top of what the C +/// API provides. +/// +/// Both the low-level `python3-sys` and higher-level `cpython` crates are used. +pub struct MainPythonInterpreter<'a> { + pub config: PythonConfig, + frozen_modules: [pyffi::_frozen; 3], + init_run: bool, + raw_allocator: Option, + raw_rust_allocator: Option, + gil: Option, + py: Option>, + program_name: Option, +} + +impl<'a> MainPythonInterpreter<'a> { + /// Construct a Python interpreter from a configuration. + /// + /// The Python interpreter is initialized as a side-effect. The GIL is held. 
+ pub fn new(config: PythonConfig) -> Result, &'static str> { + match config.terminfo_resolution { + TerminfoResolution::Dynamic => { + if let Some(v) = resolve_terminfo_dirs() { + env::set_var("TERMINFO_DIRS", &v); + } + } + TerminfoResolution::Static(ref v) => { + env::set_var("TERMINFO_DIRS", v); + } + TerminfoResolution::None => {} + } + + let (raw_allocator, raw_rust_allocator) = match config.raw_allocator { + PythonRawAllocator::Jemalloc => (Some(raw_jemallocator()), None), + PythonRawAllocator::Rust => (None, Some(make_raw_rust_memory_allocator())), + PythonRawAllocator::System => (None, None), + }; + + let frozen_modules = make_custom_frozen_modules(&config); + + let mut res = MainPythonInterpreter { + config, + frozen_modules, + init_run: false, + raw_allocator, + raw_rust_allocator, + gil: None, + py: None, + program_name: None, + }; + + res.init()?; + + Ok(res) + } + + /// Initialize the interpreter. + /// + /// This mutates global state in the Python interpreter according to the + /// bound config and initializes the Python interpreter. + /// + /// After this is called, the embedded Python interpreter is ready to + /// execute custom code. + /// + /// If called more than once, the function is a no-op from the perspective + /// of interpreter initialization. + /// + /// Returns a Python instance which has the GIL acquired. + fn init(&mut self) -> Result { + if self.init_run { + return Ok(self.acquire_gil()); + } + + let config = &self.config; + + let exe = env::current_exe().or_else(|_| Err("could not obtain current exe"))?; + let origin = exe + .parent() + .ok_or_else(|| "unable to get exe parent")? + .display() + .to_string(); + + let sys_paths: Vec = config + .sys_paths + .iter() + .map(|path| path.replace("$ORIGIN", &origin)) + .collect(); + + // TODO should we call PyMem::SetupDebugHooks() if enabled? 
+ if let Some(raw_allocator) = &self.raw_allocator { + unsafe { + let ptr = raw_allocator as *const _; + pyffi::PyMem_SetAllocator( + pyffi::PyMemAllocatorDomain::PYMEM_DOMAIN_RAW, + ptr as *mut _, + ); + } + } else if let Some(raw_rust_allocator) = &self.raw_rust_allocator { + unsafe { + let ptr = &raw_rust_allocator.allocator as *const _; + pyffi::PyMem_SetAllocator( + pyffi::PyMemAllocatorDomain::PYMEM_DOMAIN_RAW, + ptr as *mut _, + ); + } + } + + // Module state is a bit wonky. + // + // Our in-memory importer relies on a special module which holds references + // to Python objects exposing module/resource data. This module is imported as + // part of initializing the Python interpreter. + // + // This Python module object needs to hold references to the raw Python module + // and resource data. Those references are defined by the InitModuleState struct. + // + // Unfortunately, we can't easily associate state with the interpreter before + // calling Py_Initialize(). And the module initialization function receives no + // arguments. Our solution is to update a global pointer to point at "our" state + // then call Py_Initialize(). The module will be initialized as part of calling + // Py_Initialize(). It will copy the contents at the pointer into the local + // module state and the global pointer will be unused after that. The end result + // is that we have no reliance on global variables outside of a short window + // between now and when Py_Initialize() is called. + // + // We could potentially do away with this global variable by using a closure for + // the initialization function. But this rabbit hole may involve gross hackery + // like dynamic module names. It probably isn't worth it. + + // It is important for references in this struct to have a lifetime of at least + // that of the interpreter. + // TODO specify lifetimes so the compiler validates this for us. 
+ let module_state = super::importer::InitModuleState { + register_filesystem_importer: self.config.filesystem_importer, + sys_paths, + py_modules_data: config.py_modules_data, + py_resources_data: config.py_resources_data, + }; + + if config.use_custom_importlib { + // Replace the frozen modules in the interpreter with our custom set + // that knows how to import from memory. + unsafe { + pyffi::PyImport_FrozenModules = self.frozen_modules.as_ptr(); + } + + // Register our _pyoxidizer_importer extension which provides importing functionality. + unsafe { + // name char* needs to live as long as the interpreter is active. + pyffi::PyImport_AppendInittab( + PYOXIDIZER_IMPORTER_NAME.as_ptr() as *const i8, + Some(PyInit__pyoxidizer_importer), + ); + + // Move pointer to our stack allocated instance. This pointer will be + // accessed when creating the Python module object, which should be + // done automatically as part of low-level interpreter initialization + // when calling Py_Initialize() below. + super::importer::NEXT_MODULE_STATE = &module_state; + } + } + + // TODO call PyImport_ExtendInitTab to avoid O(n) overhead. + for e in &config.extra_extension_modules { + let res = unsafe { + pyffi::PyImport_AppendInittab(e.name.as_ptr() as *const i8, Some(e.init_func)) + }; + + if res != 0 { + return Err("unable to register extension module"); + } + } + + let exe_str = exe.to_str().ok_or_else(|| "unable to convert exe to str")?; + + let home = OwnedPyStr::from_str(exe_str)?; + + unsafe { + // Pointer needs to live for lifetime of interpreter. + pyffi::Py_SetPythonHome(home.as_wchar_ptr()); + } + + let program_name = OwnedPyStr::from_str(exe_str)?; + + unsafe { + pyffi::Py_SetProgramName(program_name.as_wchar_ptr()); + } + + // Value needs to live for lifetime of interpreter. + self.program_name = Some(program_name); + + // If we don't call Py_SetPath(), Python has its own logic for initializing it. 
+ // We set it to an empty string because we don't want any paths by default. If + // we do have defined paths, they will be set after Py_Initialize(). + unsafe { + // Value is copied internally. So short lifetime is OK. + let value = OwnedPyStr::from_str("")?; + pyffi::Py_SetPath(value.as_wchar_ptr()); + } + + if let (Some(ref encoding), Some(ref errors)) = + (&config.standard_io_encoding, &config.standard_io_errors) + { + let cencoding = CString::new(encoding.clone()) + .or_else(|_| Err("unable to convert encoding to C string"))?; + let cerrors = CString::new(errors.clone()) + .or_else(|_| Err("unable to convert encoding error mode to C string"))?; + + let res = unsafe { + pyffi::Py_SetStandardStreamEncoding( + cencoding.as_ptr() as *const i8, + cerrors.as_ptr() as *const i8, + ) + }; + + if res != 0 { + return Err("unable to set standard stream encoding"); + } + } + + unsafe { + pyffi::Py_DontWriteBytecodeFlag = if config.dont_write_bytecode { 1 } else { 0 }; + pyffi::Py_IgnoreEnvironmentFlag = if config.ignore_python_env { 1 } else { 0 }; + pyffi::Py_NoSiteFlag = if config.import_site { 0 } else { 1 }; + pyffi::Py_NoUserSiteDirectory = if config.import_user_site { 0 } else { 1 }; + pyffi::Py_OptimizeFlag = config.opt_level; + pyffi::Py_UnbufferedStdioFlag = if config.unbuffered_stdio { 1 } else { 0 }; + } + + /* Pre-initialization functions we could support: + * + * PyObject_SetArenaAllocator() + * PySys_AddWarnOption() + * PySys_AddXOption() + * PySys_ResetWarnOptions() + */ + + unsafe { + pyffi::Py_Initialize(); + } + + // We shouldn't be accessing this pointer after Py_Initialize(). And the + // memory is stack allocated and doesn't outlive this frame. We don't want + // to leave a stack pointer sitting around! + unsafe { + super::importer::NEXT_MODULE_STATE = std::ptr::null(); + } + + let py = unsafe { Python::assume_gil_acquired() }; + self.py = Some(py); + self.init_run = true; + + // env::args() panics if arguments aren't valid Unicode. 
But invalid + // Unicode arguments are possible and some applications may want to + // support them. + // + // env::args_os() provides access to the raw OsString instances, which + // will be derived from wchar_t on Windows and char* on POSIX. We can + // convert these to Python str instances using a platform-specific + // mechanism. + let args_objs = env::args_os() + .map(|os_arg| osstring_to_str(py, os_arg)) + .collect::, &'static str>>()?; + + // This will steal the pointer to the elements and mem::forget them. + let args = PyList::new(py, &args_objs); + let argv = b"argv\0"; + + let res = args.with_borrowed_ptr(py, |args_ptr| unsafe { + pyffi::PySys_SetObject(argv.as_ptr() as *const i8, args_ptr) + }); + + match res { + 0 => (), + _ => return Err("unable to set sys.argv"), + } + + if config.argvb { + let args_objs: Vec = env::args_os() + .map(|os_arg| osstring_to_bytes(py, os_arg)) + .collect(); + + let args = PyList::new(py, &args_objs); + let argvb = b"argvb\0"; + + let res = args.with_borrowed_ptr(py, |args_ptr| unsafe { + pyffi::PySys_SetObject(argvb.as_ptr() as *const i8, args_ptr) + }); + + match res { + 0 => (), + _ => return Err("unable to set sys.argvb"), + } + } + + // As a convention, sys.oxidized is set to indicate we are running from + // a self-contained application. 
+        let oxidized = b"oxidized\0";
+
+        let res = py.True().with_borrowed_ptr(py, |py_true| unsafe {
+            pyffi::PySys_SetObject(oxidized.as_ptr() as *const i8, py_true)
+        });
+
+        match res {
+            0 => (),
+            _ => return Err("unable to set sys.oxidized"),
+        }
+
+        if config.sys_frozen {
+            let frozen = b"frozen\0";
+
+            match py.True().with_borrowed_ptr(py, |py_true| unsafe {
+                pyffi::PySys_SetObject(frozen.as_ptr() as *const i8, py_true)
+            }) {
+                0 => (),
+                _ => return Err("unable to set sys.frozen"),
+            }
+        }
+
+        if config.sys_meipass {
+            let meipass = b"_MEIPASS\0";
+            let value = PyString::new(py, &origin);
+
+            match value.with_borrowed_ptr(py, |py_value| unsafe {
+                pyffi::PySys_SetObject(meipass.as_ptr() as *const i8, py_value)
+            }) {
+                0 => (),
+                _ => return Err("unable to set sys._MEIPASS"),
+            }
+        }
+
+        Ok(py)
+    }
+
+    /// Ensure the Python GIL is released.
+    ///
+    /// Does nothing when no interpreter handle is currently held.
+    pub fn release_gil(&mut self) {
+        if self.py.is_some() {
+            // Forget the interpreter handle first, then drop the guard.
+            self.py = None;
+            self.gil = None;
+        }
+    }
+
+    /// Ensure the Python GIL is acquired, returning a handle on the interpreter.
+    ///
+    /// If a handle is already stored it is returned as-is; otherwise a new
+    /// `GILGuard` is acquired and both the guard and the handle are stored on
+    /// `self` for later calls.
+    pub fn acquire_gil(&mut self) -> Python<'a> {
+        match self.py {
+            Some(py) => py,
+            None => {
+                let gil = GILGuard::acquire();
+                // The guard acquired on the previous line is stored on `self`
+                // below, alongside the handle.
+                let py = unsafe { Python::assume_gil_acquired() };
+
+                self.gil = Some(gil);
+                self.py = Some(py);
+
+                py
+            }
+        }
+    }
+
+    /// Runs the interpreter with the default code execution settings.
+    ///
+    /// The crate was built with settings that configure what should be
+    /// executed by default. Those settings will be loaded and executed.
+    ///
+    /// Dispatches on `self.config.run`: nothing, a REPL, a module run as
+    /// `__main__`, or a string of code.
+    pub fn run(&mut self) -> PyResult<PyObject> {
+        // clone() to avoid issues mixing mutable and immutable borrows of self.
+        let run = self.config.run.clone();
+
+        let py = self.acquire_gil();
+
+        match run {
+            PythonRunMode::None => Ok(py.None()),
+            PythonRunMode::Repl => self.run_repl(),
+            PythonRunMode::Module { module } => self.run_module_as_main(&module),
+            PythonRunMode::Eval { code } => self.run_code(&code),
+        }
+    }
+
+    /// Handle a raised SystemExit exception.
+    ///
+    /// This emulates the behavior in pythonrun.c:handle_system_exit() and
+    /// _Py_HandleSystemExit() but without the call to exit(), which we don't want.
+    ///
+    /// Returns the exit code to use: `0` when the exception carries no value
+    /// (or `None`), the integer itself when the value is an int, and `1`
+    /// after writing any other value to stderr.
+    ///
+    /// # Errors
+    ///
+    /// Returns a static message if stdout/stderr cannot be flushed or the
+    /// `sys` module cannot be imported.
+    fn handle_system_exit(&mut self, py: Python, err: PyErr) -> Result<i32, &'static str> {
+        std::io::stdout()
+            .flush()
+            .or_else(|_| Err("failed to flush stdout"))?;
+
+        let mut value = match err.pvalue {
+            Some(ref instance) => {
+                if instance.as_ptr() == py.None().as_ptr() {
+                    return Ok(0);
+                }
+
+                instance.clone_ref(py)
+            }
+            None => {
+                return Ok(0);
+            }
+        };
+
+        if unsafe { pyffi::PyExceptionInstance_Check(value.as_ptr()) } != 0 {
+            // The error code should be in the "code" attribute.
+            if let Ok(code) = value.getattr(py, "code") {
+                if code == py.None() {
+                    return Ok(0);
+                }
+
+                // Else pretend exc_value.code is the new exception value to use
+                // and fall through to below.
+                value = code;
+            }
+        }
+
+        if unsafe { pyffi::PyLong_Check(value.as_ptr()) } != 0 {
+            // Narrowing to i32 mirrors the `(int)` cast in the C implementation.
+            return Ok(unsafe { pyffi::PyLong_AsLong(value.as_ptr()) as i32 });
+        }
+
+        let sys_module = py
+            .import("sys")
+            .or_else(|_| Err("unable to obtain sys module"))?;
+        let stderr = sys_module.get(py, "stderr");
+
+        // This is a cargo cult from the canonical implementation.
+        unsafe { pyffi::PyErr_Clear() }
+
+        match stderr {
+            Ok(o) => unsafe {
+                pyffi::PyFile_WriteObject(value.as_ptr(), o.as_ptr(), pyffi::Py_PRINT_RAW);
+            },
+            Err(_) => {
+                // No usable sys.stderr: fall back to the C-level stderr stream.
+                unsafe {
+                    pyffi::PyObject_Print(value.as_ptr(), stderr_to_file(), pyffi::Py_PRINT_RAW);
+                }
+                std::io::stderr()
+                    .flush()
+                    .or_else(|_| Err("failure to flush stderr"))?;
+            }
+        }
+
+        unsafe {
+            // c_char rather than i8: c_char is unsigned on some targets.
+            pyffi::PySys_WriteStderr(b"\n\0".as_ptr() as *const c_char);
+        }
+
+        // This frees references to this exception, which may be necessary to avoid
+        // badness.
+        err.restore(py);
+        unsafe {
+            pyffi::PyErr_Clear();
+        }
+
+        Ok(1)
+    }
+
+    /// Runs the interpreter and handles any exception that was raised.
+    ///
+    /// `SystemExit` is turned into `PythonRunResult::Exit` with an exit code
+    /// instead of terminating the process; any other exception is printed
+    /// and reported as `PythonRunResult::Err`.
+    pub fn run_and_handle_error(&mut self) -> PythonRunResult {
+        // There are underdefined lifetime bugs at play here. There is no
+        // explicit lifetime for the PyObjects returned. If we don't have
+        // the local variable in scope, we can get into a situation where
+        // drop() on self is called before the PyObject's drop(). This is
+        // problematic because PyObject's drop() attempts to acquire the GIL.
+        // If the interpreter is shut down, there is no GIL to acquire, and
+        // we may segfault.
+        // TODO look into setting lifetimes properly so the compiler can
+        // prevent some issues.
+        let res = self.run();
+        let py = self.acquire_gil();
+
+        match res {
+            Ok(_) => PythonRunResult::Ok {},
+            Err(err) => {
+                // SystemExit is special in that PyErr_PrintEx() will call
+                // exit() if it is seen. So, we handle it manually so we can
+                // return an exit code instead of exiting.
+
+                // TODO surely the cpython crate offers a better way to do this...
+                // Restore the error so PyErr_ExceptionMatches() can inspect it,
+                // then fetch it back so we own it again.
+                err.restore(py);
+                let matches =
+                    unsafe { pyffi::PyErr_ExceptionMatches(pyffi::PyExc_SystemExit) } != 0;
+                let err = cpython::PyErr::fetch(py);
+
+                if matches {
+                    return PythonRunResult::Exit {
+                        code: match self.handle_system_exit(py, err) {
+                            Ok(code) => code,
+                            Err(msg) => {
+                                eprintln!("{}", msg);
+                                1
+                            }
+                        },
+                    };
+                }
+
+                self.print_err(err);
+
+                PythonRunResult::Err {}
+            }
+        }
+    }
+
+    /// Calls run() and resolves a suitable exit code.
+    ///
+    /// Success maps to `0`, an unhandled exception to `1`, and `SystemExit`
+    /// to the code it carried.
+    pub fn run_as_main(&mut self) -> i32 {
+        match self.run_and_handle_error() {
+            PythonRunResult::Ok {} => 0,
+            PythonRunResult::Err {} => 1,
+            PythonRunResult::Exit { code } => code,
+        }
+    }
+
+    /// Runs a Python module as the __main__ module.
+    ///
+    /// Returns the execution result of the module code.
+    ///
+    /// The interpreter is automatically initialized if needed.
+    ///
+    /// # Errors
+    ///
+    /// Returns the raised Python exception without printing it, so the
+    /// caller decides how to report it (and can intercept `SystemExit`).
+    pub fn run_module_as_main(&mut self, name: &str) -> PyResult<PyObject> {
+        let py = self.acquire_gil();
+
+        // This is modeled after runpy.py:_run_module_as_main().
+        let main_ptr =
+            unsafe { pyffi::PyImport_AddModule("__main__\0".as_ptr() as *const c_char) };
+
+        // PyImport_AddModule() returns NULL with an exception set on failure;
+        // wrapping NULL in a PyObject would be undefined behavior.
+        if main_ptr.is_null() {
+            return Err(PyErr::fetch(py));
+        }
+
+        // The pointer is a borrowed reference, hence from_borrowed_ptr().
+        let main: PyModule = unsafe { PyObject::from_borrowed_ptr(py, main_ptr).cast_into(py)? };
+
+        let main_dict = main.dict(py);
+
+        let importlib_util = py.import("importlib.util")?;
+        let spec = importlib_util.call(py, "find_spec", (name,), None)?;
+        let loader = spec.getattr(py, "loader")?;
+        let code = loader.call_method(py, "get_code", (name,), None)?;
+
+        let origin = spec.getattr(py, "origin")?;
+        let cached = spec.getattr(py, "cached")?;
+
+        // TODO handle __package__.
+        main_dict.set_item(py, "__name__", "__main__")?;
+        main_dict.set_item(py, "__file__", origin)?;
+        main_dict.set_item(py, "__cached__", cached)?;
+        main_dict.set_item(py, "__doc__", py.None())?;
+        main_dict.set_item(py, "__loader__", loader)?;
+        main_dict.set_item(py, "__spec__", spec)?;
+
+        unsafe {
+            let globals = main_dict.as_object().as_ptr();
+            let res = pyffi::PyEval_EvalCode(code.as_ptr(), globals, globals);
+
+            if res.is_null() {
+                // Hand the pending exception to the caller. Printing it here
+                // would clear it (leaving nothing to return) and would let
+                // PyErr_PrintEx() exit the process on SystemExit.
+                Err(PyErr::fetch(py))
+            } else {
+                Ok(PyObject::from_owned_ptr(py, res))
+            }
+        }
+    }
+
+    /// Start and run a Python REPL.
+    ///
+    /// This emulates what CPython's main.c does.
+    ///
+    /// The interpreter is automatically initialized if needed.
+    ///
+    /// Always returns `None` on completion; the status returned by
+    /// `PyRun_AnyFileExFlags()` is currently ignored.
+    pub fn run_repl(&mut self) -> PyResult<PyObject> {
+        let py = self.acquire_gil();
+
+        unsafe {
+            pyffi::Py_InspectFlag = 0;
+        }
+
+        // readline is optional. We don't care if it fails.
+        let _ = py.import("readline");
+
+        let sys = py.import("sys")?;
+
+        if let Ok(hook) = sys.get(py, "__interactivehook__") {
+            hook.call(py, NoArgs, None)?;
+        }
+
+        // CPython's main.c labels the interactive input stream "<stdin>".
+        let stdin_filename = "<stdin>";
+        // NOTE(review): the exception type parameter was lost in this copy of
+        // the patch; ValueError is assumed -- confirm against upstream.
+        let filename = CString::new(stdin_filename).or_else(|_| {
+            Err(PyErr::new::<cpython::exc::ValueError, _>(
+                py,
+                "could not create CString",
+            ))
+        })?;
+        let mut cf = pyffi::PyCompilerFlags { cf_flags: 0 };
+
+        // TODO use return value.
+        unsafe {
+            let stdin = stdin_to_file();
+            pyffi::PyRun_AnyFileExFlags(stdin, filename.as_ptr() as *const c_char, 0, &mut cf)
+        };
+
+        Ok(py.None())
+    }
+
+    /// Runs Python code provided by a string.
+    ///
+    /// This is similar to what ``python -c <code>`` would do.
+    ///
+    /// The interpreter is automatically initialized if needed.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `code` contains an interior NUL byte, if the
+    /// `__main__` module cannot be obtained, or if the code raises.
+    pub fn run_code(&mut self, code: &str) -> PyResult<PyObject> {
+        let py = self.acquire_gil();
+
+        // NOTE(review): the exception type parameter was lost in this copy of
+        // the patch; ValueError is assumed -- confirm against upstream.
+        let code = CString::new(code).or_else(|_| {
+            Err(PyErr::new::<cpython::exc::ValueError, _>(
+                py,
+                "source code is not a valid C string",
+            ))
+        })?;
+
+        unsafe {
+            let main = pyffi::PyImport_AddModule("__main__\0".as_ptr() as *const _);
+
+            if main.is_null() {
+                return Err(PyErr::fetch(py));
+            }
+
+            // The same dict is used for globals and locals.
+            let main_dict = pyffi::PyModule_GetDict(main);
+
+            let res = pyffi::PyRun_StringFlags(
+                code.as_ptr() as *const _,
+                pyffi::Py_file_input,
+                main_dict,
+                main_dict,
+                std::ptr::null_mut(),
+            );
+
+            if res.is_null() {
+                Err(PyErr::fetch(py))
+            } else {
+                Ok(PyObject::from_owned_ptr(py, res))
+            }
+        }
+    }
+
+    /// Print a Python error.
+    ///
+    /// Under the hood this calls ``PyErr_PrintEx()``, which may call
+    /// ``Py_Exit()`` and may write to stderr.
+    pub fn print_err(&mut self, err: PyErr) {
+        let py = self.acquire_gil();
+        err.print(py);
+    }
+}
+
+/// Write loaded Python modules to a directory.
+///
+/// Given a Python interpreter and a path to a directory, this will create a
+/// file in that directory named ``modules-<uuid>`` and write a ``\n`` delimited
+/// list of loaded names from ``sys.modules`` into that file.
+///
+/// The directory is created if missing, and the names are written in sorted
+/// order.
+///
+/// # Errors
+///
+/// Returns a static message naming the step that failed.
+fn write_modules_to_directory(py: Python, path: &PathBuf) -> Result<(), &'static str> {
+    // TODO this needs better error handling all over.
+
+    fs::create_dir_all(path).or_else(|_| Err("could not create directory for modules"))?;
+
+    // A random suffix keeps separate runs from overwriting each other's file.
+    let rand = uuid::Uuid::new_v4();
+
+    let path = path.join(format!("modules-{}", rand));
+
+    let sys = py
+        .import("sys")
+        .or_else(|_| Err("could not obtain sys module"))?;
+    let modules = sys
+        .get(py, "modules")
+        .or_else(|_| Err("could not obtain sys.modules"))?;
+
+    let modules = modules
+        .cast_as::<PyDict>(py)
+        .or_else(|_| Err("sys.modules is not a dict"))?;
+
+    // BTreeSet gives de-duplicated, sorted output.
+    let mut names = BTreeSet::new();
+    for (key, _value) in modules.items(py) {
+        names.insert(
+            key.extract::<String>(py)
+                .or_else(|_| Err("module name is not a str"))?,
+        );
+    }
+
+    let mut f = fs::File::create(path).or_else(|_| Err("could not open file for writing"))?;
+
+    for name in names {
+        f.write_fmt(format_args!("{}\n", name))
+            .or_else(|_| Err("could not write"))?;
+    }
+
+    Ok(())
+}
+
+impl<'a> Drop for MainPythonInterpreter<'a> {
+    /// Optionally dumps the loaded module names, then finalizes Python.
+    ///
+    /// The dump happens only when `write_modules_directory_env` names an
+    /// environment variable that is set; its value is the target directory.
+    fn drop(&mut self) {
+        if let Some(key) = &self.config.write_modules_directory_env {
+            if let Ok(path) = env::var(key) {
+                let path = PathBuf::from(path);
+                let py = self.acquire_gil();
+
+                // Best effort: a failed dump must not prevent finalization.
+                if let Err(msg) = write_modules_to_directory(py, &path) {
+                    eprintln!("error writing modules file: {}", msg);
+                }
+            }
+        }
+
+        // The finalization status is deliberately ignored.
+        let _ = unsafe { pyffi::Py_FinalizeEx() };
+    }
+}
diff --git a/rust/pyembed/src/pystr.rs b/rust/pyembed/src/pystr.rs
new file mode 100644
--- /dev/null
+++ b/rust/pyembed/src/pystr.rs
@@ -0,0 +1,98 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Bridge Rust and Python string types.
+ +use libc::{c_void, size_t, wchar_t}; +use python3_sys as pyffi; +use std::ffi::{CString, OsString}; +use std::ptr::null_mut; + +#[cfg(target_family = "unix")] +use std::os::unix::ffi::OsStrExt; +#[cfg(target_family = "windows")] +use std::os::windows::prelude::OsStrExt; + +use cpython::{PyObject, Python}; + +#[derive(Debug)] +pub struct OwnedPyStr { + data: *const wchar_t, +} + +impl OwnedPyStr { + pub fn as_wchar_ptr(&self) -> *const wchar_t { + self.data + } + + pub fn from_str(s: &str) -> Result { + // We need to convert to a C string so there is a terminal NULL + // otherwise Py_DecodeLocale() can get confused. + let cs = CString::new(s).or_else(|_| Err("source string has NULL bytes"))?; + + let size: *mut size_t = null_mut(); + let ptr = unsafe { pyffi::Py_DecodeLocale(cs.as_ptr(), size) }; + + if ptr.is_null() { + Err("could not convert str to Python string") + } else { + Ok(OwnedPyStr { data: ptr }) + } + } +} + +impl Drop for OwnedPyStr { + fn drop(&mut self) { + unsafe { pyffi::PyMem_RawFree(self.data as *mut c_void) } + } +} + +#[cfg(target_family = "unix")] +const SURROGATEESCAPE: &[u8] = b"surrogateescape\0"; + +#[cfg(target_family = "unix")] +pub fn osstring_to_str(py: Python, s: OsString) -> Result { + // PyUnicode_DecodeLocaleAndSize says the input must have a trailing NULL. + // So use a CString for that. + let b = CString::new(s.as_bytes()).or_else(|_| Err("not a valid C string"))?; + unsafe { + let o = pyffi::PyUnicode_DecodeLocaleAndSize( + b.as_ptr() as *const i8, + b.to_bytes().len() as isize, + SURROGATEESCAPE.as_ptr() as *const i8, + ); + + Ok(PyObject::from_owned_ptr(py, o)) + } +} + +#[cfg(target_family = "windows")] +pub fn osstring_to_str(py: Python, s: OsString) -> Result { + // Windows OsString should be valid UTF-16. 
+ let w: Vec = s.encode_wide().collect(); + unsafe { + Ok(PyObject::from_owned_ptr( + py, + pyffi::PyUnicode_FromWideChar(w.as_ptr(), w.len() as isize), + )) + } +} + +#[cfg(target_family = "unix")] +pub fn osstring_to_bytes(py: Python, s: OsString) -> PyObject { + let b = s.as_bytes(); + unsafe { + let o = pyffi::PyBytes_FromStringAndSize(b.as_ptr() as *const i8, b.len() as isize); + PyObject::from_owned_ptr(py, o) + } +} + +#[cfg(target_family = "windows")] +pub fn osstring_to_bytes(py: Python, s: OsString) -> PyObject { + let w: Vec = s.encode_wide().collect(); + unsafe { + let o = pyffi::PyBytes_FromStringAndSize(w.as_ptr() as *const i8, w.len() as isize * 2); + PyObject::from_owned_ptr(py, o) + } +}