Source code for SciKit.utils

"""
Analysis utilities for molecular dynamics simulation post-processing.

Provides functions for statistical analysis, histogram/distribution conversion,
PCA-based free-energy landscapes, block averaging, and data loading helpers
built on top of NumPy, SciPy, and MDAnalysis.
"""

import numpy as np
import MDAnalysis as mda


[docs] def scatter2hist(point_list, num_bin, styles): """ Convert a 1-D array of scatter points to a histogram or smooth distribution. Computes a normalized histogram from raw data and returns either a stepped distribution (suitable for ``ax.plot`` step-style rendering) or a cubic/quadratic spline-smoothed curve, depending on ``styles``. Zero-valued padding is appended to both ends so the distribution reaches zero at the boundaries. Parameters ---------- point_list : array_like 1-D sequence of scalar data points to histogram. num_bin : int Number of histogram bins. For smoothed styles, the output x-grid is upsampled to ``num_bin * 10`` points. styles : {'pdf', 'pmf', 's_pdf', 's_pmf'} Output type: - ``'pdf'`` : Stepped probability density function (area under curve ≈ 1). - ``'pmf'`` : Stepped probability mass function (sum of values ≈ 1). - ``'s_pdf'``: Cubic spline-smoothed PDF curve. - ``'s_pmf'``: Quadratic spline-smoothed PMF curve. Returns ------- x : numpy.ndarray Bin-center positions (or smoothed x-grid for ``'s_pdf'``/``'s_pmf'``), with one extra zero-padded point prepended and appended. y : numpy.ndarray Corresponding PDF or PMF values, zero-padded at both ends. Raises ------ ValueError If ``styles`` is not one of the four accepted strings. Notes ----- The smoothed variants use ``scipy.interpolate.make_interp_spline`` with spline order ``k=3`` (PDF) or ``k=2`` (PMF). Negative spline artefacts near the zero-padded boundaries are possible for small ``num_bin`` values. Examples -------- >>> x, y = scatter2hist(distances, num_bin=50, styles='pdf') >>> ax.plot(x, y) >>> x_smooth, y_smooth = scatter2hist(distances, num_bin=50, styles='s_pdf') >>> ax.plot(x_smooth, y_smooth) """ from scipy.interpolate import make_interp_spline counts, bins = np.histogram(point_list, num_bin) pmf = counts / counts.sum() pdf = pmf / np.diff(bins) # Use bin centers instead of edges bin_centers = 0.5 * (bins[1:] + bins[:-1]) bin_width = np.diff(bins)[0] # Pad edges for smooth or full histogram plotting def pad_edges(x, y): x = np.insert(x, 0, x[0] - bin_width) x = np.append(x, x[-1] + bin_width) y = np.insert(y, 0, 0) y = np.append(y, 0) return x, y if styles == 'pdf': bin_centers, pdf = pad_edges(bin_centers, pdf) return bin_centers, pdf elif styles == 'pmf': bin_centers, pmf = pad_edges(bin_centers, pmf) return bin_centers, pmf elif styles == 's_pdf': bin_centers, pdf = pad_edges(bin_centers, pdf) bins_smooth = np.linspace(bin_centers.min(), bin_centers.max(), num_bin * 10) spline = make_interp_spline(bin_centers, pdf, k=3) pdf_smooth = spline(bins_smooth) return bins_smooth, pdf_smooth elif styles == 's_pmf': bin_centers, pmf = pad_edges(bin_centers, pmf) bins_smooth = np.linspace(bin_centers.min(), bin_centers.max(), num_bin * 10) spline = make_interp_spline(bin_centers, pmf, k=2) pmf_smooth = spline(bins_smooth) return bins_smooth, pmf_smooth else: raise ValueError("styles must be one of ['pdf', 'pmf', 's_pdf', 's_pmf']")
[docs] def mda_pca(psf, dcd, sel, align=True): """ Perform PCA on an MD trajectory and return the first two principal components. Loads a CHARMM PSF + DCD trajectory pair, selects atoms, and runs MDAnalysis PCA. Returns the per-frame projections onto PC1 and PC2 along with their individual explained-variance fractions. Parameters ---------- psf : str Path to the CHARMM PSF topology file. dcd : str Path to the DCD trajectory file. sel : str MDAnalysis atom-selection string applied to both alignment and PCA (e.g., ``'backbone'``, ``'name CA'``). align : bool, optional If ``True`` (default), align all frames to the first frame before computing the covariance matrix. Returns ------- pc1 : numpy.ndarray, shape (n_frames,) Per-frame projection onto the first principal component. pc2 : numpy.ndarray, shape (n_frames,) Per-frame projection onto the second principal component. var1 : float Explained variance fraction of PC1 (value in [0, 1]). var2 : float Explained variance fraction of PC2 (value in [0, 1]). Notes ----- ``var1`` is taken directly from ``pc.cumulated_variance[0]``. ``var2`` is the *incremental* variance of PC2, computed as ``cumulated_variance[1] - cumulated_variance[0]``. Multiply by 100 to obtain a percentage. Examples -------- >>> pc1, pc2, var1, var2 = mda_pca('system.psf', 'traj.dcd', 'backbone') >>> print(f'PC1: {var1*100:.1f}% PC2: {var2*100:.1f}%') """ from MDAnalysis.analysis import pca u = mda.Universe(psf, dcd) atomgroup = u.select_atoms(sel) pc = pca.PCA(u, select=sel, align=align).run() pc1, pc2 = pc.transform(atomgroup)[:, 0], pc.transform(atomgroup)[:, 1] var1, var2 = pc.cumulated_variance[0], pc.cumulated_variance[1] - pc.cumulated_variance[0] return pc1, pc2, var1, var2
[docs] def pca2fe(pc1, pc2, num_bin=100): """ Convert 2-D PCA projections to a free-energy landscape in units of k\ :sub:`B`\ T. Bins ``(pc1, pc2)`` pairs into a 2-D histogram, normalises to a PMF, and applies the Boltzmann inversion ``F = -ln(P)``. The global minimum of the finite free-energy values is shifted to zero. Parameters ---------- pc1 : array_like, shape (n_frames,) Per-frame projection onto the first principal component. pc2 : array_like, shape (n_frames,) Per-frame projection onto the second principal component. num_bin : int, optional Number of bins along each axis of the 2-D histogram (default ``100``). Returns ------- fe : numpy.ndarray, shape (num_bin, num_bin) Free-energy surface in units of k\ :sub:`B`\ T, with the global minimum set to 0. Bins with zero occupancy contain ``inf``. Notes ----- The local minimum (highest-occupancy bin) coordinates in data space are computed internally but not currently returned. Retrieve them by inspecting ``xedges`` / ``yedges`` if needed in future extensions. Examples -------- >>> fe = pca2fe(pc1, pc2, num_bin=80) >>> plt.imshow(fe.T, origin='lower') """ H, xedges, yedges = np.histogram2d(pc1, pc2, bins=num_bin) pmf = H / np.sum(H) # convert pmf to free energy fe = -np.log(pmf) fe_min = np.min(fe[np.isfinite(fe)]) fe -= fe_min # find the local minimum local_minimum = np.unravel_index(np.argmax(H), H.shape) localx, localy = xedges[local_minimum[0]], yedges[local_minimum[1]] return fe
[docs] def plt_pca(axs, fe, var1=None, var2=None, cmap='viridis', contour=False, levels=5): """ Plot a 2-D free-energy landscape from PCA projections. Displays the free-energy array as a colour-mapped image with an attached colorbar. Axis tick marks are suppressed so that the plot reflects relative rather than absolute PC coordinates. Optionally overlays iso-energy contour lines. Parameters ---------- axs : matplotlib.axes.Axes Target axes on which the landscape will be drawn. fe : numpy.ndarray, shape (N, N) Free-energy surface in units of k\ :sub:`B`\ T, as returned by :func:`pca2fe`. Transposed internally before display. var1 : float or None, optional Explained variance fraction of PC1 (value in [0, 1]). If provided together with ``var2``, percentage labels are added to the axis. Default ``None`` omits the percentage. var2 : float or None, optional Explained variance fraction of PC2 (value in [0, 1]). Default ``None`` omits the percentage. cmap : str or matplotlib.colors.Colormap, optional Colormap for the free-energy surface (default ``'viridis'``). contour : bool, optional If ``True``, overlay black iso-energy contour lines (default ``False``). levels : int, optional Number of contour levels drawn when ``contour=True`` (default ``5``). Examples -------- >>> fig, axs = general_temp(1, 1, 6, 5) >>> fe = pca2fe(pc1, pc2) >>> plt_pca(axs, fe, var1=0.35, var2=0.12, contour=True, levels=8) """ im = axs.imshow(fe.T, origin='lower', cmap=cmap) axs.figure.colorbar(im, ax=axs, label='Free energy (k$_B$T)', fraction=0.046) if var1 is None: axs.set(xlabel='PC1', xticks=[], ylabel='PC2', yticks=[]) else: axs.set(xlabel=f'PC1 ({var1*100:.2f}%)', xticks=[], ylabel=f'PC2 ({var2*100:.2f}%)', yticks=[]) if contour: axs.contour(fe.T, levels=levels, colors='k')
[docs] def pca2d(axs, psf, dcd, sel, num_bin=100, align=True, cmap='viridis'): """ Compute and plot a 2-D PCA free-energy landscape in a single call. Convenience wrapper that combines :func:`mda_pca`, :func:`pca2fe`, and :func:`plt_pca` into one step. Loads the trajectory, runs PCA, converts to a free-energy surface, and renders the result directly onto ``axs``. Prefer this function for quick exploratory plots; use the individual functions when you need intermediate results (e.g., ``pc1``/``pc2`` for further analysis). Parameters ---------- axs : matplotlib.axes.Axes Target axes on which the landscape will be drawn. psf : str Path to the CHARMM PSF topology file. dcd : str Path to the DCD trajectory file. sel : str MDAnalysis atom-selection string (e.g., ``'backbone'``). num_bin : int, optional Number of bins along each PCA axis (default ``100``). align : bool, optional If ``True`` (default), align frames before PCA. cmap : str or matplotlib.colors.Colormap, optional Colormap for the free-energy surface (default ``'viridis'``). Examples -------- >>> fig, axs = general_temp(1, 1, 6, 5) >>> pca2d(axs, 'system.psf', 'traj.dcd', 'backbone', num_bin=80) """ from MDAnalysis.analysis import pca u = mda.Universe(psf, dcd) atomgroup = u.select_atoms(sel) pc = pca.PCA(u, select=sel, align=align).run() pc1, pc2 = pc.transform(atomgroup)[:, 0], pc.transform(atomgroup)[:, 1] var1, var2 = pc.cumulated_variance[0], pc.cumulated_variance[1] - pc.cumulated_variance[0] H, xedges, yedges = np.histogram2d(pc1, pc2, bins=num_bin) pmf = H / np.sum(H) # convert pmf to free energy fe = -np.log(pmf) fe_min = np.min(fe[np.isfinite(fe)]) fe -= fe_min im = axs.imshow(fe.T, origin='lower', cmap=cmap) axs.figure.colorbar(im, ax=axs, label='Free energy (k$_B$T)', fraction=0.046) axs.set(xlabel=f'PC1 ({var1*100:.2f}%)', xticks=[], ylabel=f'PC2 ({var2*100:.2f}%)', yticks=[])
[docs] def block_mean(data, division): """ Estimate the mean and uncertainty of a time series using two-block averaging. Discards an initial fraction of the data as equilibration, then splits the remaining production region into two equal blocks. The reported average is the mean of the two block means and the error is their standard deviation — a simple variance estimator that accounts for serial correlation in MD observables. Parameters ---------- data : array_like 1-D sequence of scalar values ordered along simulation time. division : int Controls the equilibration cutoff: the first ``1/division`` of the data is discarded. The remaining ``(1 - 1/division)`` fraction is split into two equal blocks for the error estimate. For example, ``division=5`` discards the first 20 % of frames. Returns ------- average : float Mean of the two block averages. error : float Standard deviation of the two block averages (half the difference between them), representing the statistical uncertainty. Examples -------- >>> avg, err = block_mean(rg_timeseries, division=5) >>> print(f'Rg = {avg:.2f} ± {err:.2f} Å') """ l = len(data) start_frame = int(l / division) half = start_frame + int((l - start_frame) / 2) part1 = np.mean(data[start_frame:half]) part2 = np.mean(data[half:]) average = np.mean([part1, part2]) error = np.std([part1, part2]) return average, error
[docs] def rms(data): """ Compute the root-mean-square (RMS) value of an array. Parameters ---------- data : array_like 1-D or N-D array of numeric values. Returns ------- float RMS value: ``sqrt(mean(data**2))``. Examples -------- >>> rms([1, -2, 3, -4]) 2.7386... """ data = np.asarray(data) return np.sqrt(np.mean(data**2))
[docs] def block_bootstrap(data, blocksize=1000, nsamples=1000, statistic=rms): """ Estimate the standard deviation of a statistic via circular block bootstrap. Accounts for temporal autocorrelation in MD time series by resampling contiguous blocks rather than individual frames. Uses the *circular* (wraparound) variant to avoid boundary effects at the end of the array. Parameters ---------- data : array_like 1-D time-ordered array of data points. blocksize : int, optional Number of consecutive frames in each resampled block (default ``1000``). Should be larger than the autocorrelation time of the observable. nsamples : int, optional Number of independent bootstrap replicates to generate (default ``1000``). statistic : callable, optional Function applied to each bootstrap replicate to compute the quantity of interest. Must accept a 1-D array and return a scalar. Default is :func:`rms`; other useful choices include ``np.mean`` or ``np.std``. Returns ------- sd : float Bootstrap standard deviation of ``statistic`` across all replicates, computed with ``ddof=1``. Notes ----- Each bootstrap replicate is built by drawing random start indices, extracting blocks of length ``blocksize`` with circular index wrapping (``index % n``), and concatenating until the sample reaches length ``n``. The final sample is trimmed to exactly ``n`` frames. Choosing ``blocksize`` is the main tuning parameter: too small underestimates the error; too large increases variance of the estimator. A common heuristic is to set it to roughly 5–10× the autocorrelation time. Examples -------- >>> sd = block_bootstrap(rg_timeseries, blocksize=500, statistic=np.mean) >>> print(f'Bootstrap std of <Rg>: {sd:.4f} Å') """ data = np.asarray(data) n = len(data) bootstrap_stats = [] for _ in range(nsamples): # Generate bootstrap sample using circular block bootstrap bootstrap_sample = [] while len(bootstrap_sample) < n: start_idx = np.random.randint(0, n) block = [] for i in range(blocksize): block.append(data[(start_idx + i) % n]) bootstrap_sample.extend(block) # trim to exact length bootstrap_sample = bootstrap_sample[:n] bootstrap_stats.append(statistic(bootstrap_sample)) sd = np.std(bootstrap_stats, ddof=1) return sd
[docs] def remove_zero(xs, ys): """ Find the x-range that spans the non-zero support of a distribution. Scans ``ys`` from both ends to locate the first and last non-zero values, then returns the corresponding ``xs`` positions offset by ±5 index positions to retain a small margin around the support. Parameters ---------- xs : array_like 1-D array of x-axis values (e.g., bin centers). ys : array_like 1-D array of y-axis values (e.g., PDF or PMF). Must be the same length as ``xs``. Returns ------- x_nozero : 1-D array x values within the non-zero support of the distribution. y_nozero : 1-D array Corresponding y values within the non-zero support. Examples -------- >>> x, y = scatter2hist(distances, num_bin=50, styles='pdf') >>> x_new, y_new = remove_zero(x, y) >>> ax.plot(x_new, y_new) """ for idx in range(len(xs)): if ys[idx] != 0: start = idx - 5 break for idx in reversed(range(len(xs))): if ys[idx] != 0: end = idx + 5 break return xs[start:end], ys[start:end]
[docs] def myload(filename, *args, **kwargs): """ Load a whitespace-delimited text file and unpack each column as a separate array. Wraps ``numpy.loadtxt`` and returns each column as an individual element of a tuple, making it convenient to unpack multi-column data files with a single assignment. Up to eight trailing ``0`` sentinels are appended so that callers can safely unpack into more variables than there are columns without raising a ``ValueError``. Parameters ---------- filename : str or path-like Path to the text file to load. *args Additional positional arguments forwarded to ``numpy.loadtxt`` (e.g., ``skiprows``). **kwargs Additional keyword arguments forwarded to ``numpy.loadtxt`` (e.g., ``delimiter``, ``usecols``, ``dtype``). Returns ------- tuple A tuple whose first ``n`` elements are 1-D ``numpy.ndarray`` objects, one per column in the file, followed by eight integer zeros as padding. The total length is therefore ``n + 8``. Notes ----- The eight trailing zeros allow patterns like:: col1, col2, *_ = myload('data.txt') without the caller needing to know the exact column count. However, the padding means ``len(myload(...))`` is always ``n_cols + 8`` regardless of the actual file width. Examples -------- >>> time, rg, energy, *_ = myload('observables.dat', skiprows=1) >>> plt.plot(time, rg) """ data = np.loadtxt(filename, *args, **kwargs) cols = len(data[0, :]) results = [] for col in range(cols): results.append(data[:, col]) results = results + [0, 0, 0, 0, 0, 0, 0, 0] return tuple(results)
[docs] def get_overlap(x1, y1, x2, y2): """ Find the x values present in both datasets, and return the corresponding y values from each dataset at those shared x positions. No interpolation is performed — only x values that literally appear in both x1 and x2 are included in the output. The result is independent of argument order: get_overlap(x1, y1, x2, y2) and get_overlap(x2, y2, x1, y1) will return the same x_overlap (though y1_overlap and y2_overlap will swap). Parameters ---------- x1 : array-like X-coordinates of the first dataset (assumed sorted ascending, no duplicates). y1 : array-like Y-values corresponding to x1. x2 : array-like X-coordinates of the second dataset (assumed sorted ascending, no duplicates). y2 : array-like Y-values corresponding to x2. Returns ------- x_overlap : np.ndarray X-values present in both x1 and x2, sorted ascending. y1_overlap : np.ndarray Y-values from dataset 1 at x_overlap. y2_overlap : np.ndarray Y-values from dataset 2 at x_overlap. Raises ------ ValueError If x1 and x2 share no common x values. Example ------- >>> x1 = [1, 2, 3, 4, 5] >>> y1 = [10, 20, 30, 40, 50] >>> x2 = [3, 4, 5, 6, 7] >>> y2 = [300, 400, 500, 600, 700] >>> x, y1_out, y2_out = get_overlap(x1, y1, x2, y2) >>> # x = [3, 4, 5], y1_out = [30, 40, 50], y2_out = [300, 400, 500] """ x1, y1, x2, y2 = map(np.asarray, (x1, y1, x2, y2)) x_overlap = np.intersect1d(x1, x2) # values present in both, order-independent if len(x_overlap) == 0: raise ValueError( f"No overlapping x values: x1 spans [{x1.min()}, {x1.max()}], " f"x2 spans [{x2.min()}, {x2.max()}]." ) # Index directly — no interpolation needed since both arrays contain these x values y1_overlap = y1[np.isin(x1, x_overlap)] y2_overlap = y2[np.isin(x2, x_overlap)] return x_overlap, y1_overlap, y2_overlap
[docs] def stack_jagged(arrays, model="min"): """ Stack a list of 1D arrays as columns, aligning on the overlap. Parameters ---------- arrays : list of array-like Input arrays or lists, can have different lengths. model : {'min', 'max'}, optional 'min' : trim all arrays to the shortest length (default). 'max' : pad shorter arrays with NaN to match the longest length. Returns ------- np.ndarray 2D array of shape (min_len, n_arrays) if model='min', or (max_len, n_arrays) if model='max'. """ arrays = [np.asarray(arr, dtype=float) for arr in arrays] if model == "min": target_len = min(len(arr) for arr in arrays) arrays = [arr[:target_len] for arr in arrays] elif model == "max": target_len = max(len(arr) for arr in arrays) arrays = [ np.pad(arr, (0, target_len - len(arr)), constant_values=np.nan) for arr in arrays ] result = arrays[0] for arr in arrays[1:]: result = np.column_stack((result, arr)) return result
[docs] def mylinregress(xs, ys, xrange=None): """ Perform linear regression on (x, y) data, optionally restricted to a specific x-range. Parameters ---------- xs : array-like Independent variable values. ys : array-like Dependent variable values, same length as xs. xrange : tuple of (float, float), optional If provided, only data points with x in [xrange[0], xrange[1]] are plotted. Returns ------- r_value : float Correlation coefficient of the linear fit. x_fit : np.ndarray X values for the fitted line, either spanning the full range of xs or limited to xrange if provided. y_fit : np.ndarray Corresponding y values of the fitted line at x_fit. """ from scipy.stats import linregress xs = np.asarray(xs) ys = np.asarray(ys) slope, intercept, r_value, p_value, std_err = linregress(xs, ys) if xrange: x_fit = np.linspace(xrange[0], xrange[1], 200) y_fit = slope * x_fit + intercept else: x_fit = np.linspace(xs.min()*0.9, xs.max()*1.1, 200) y_fit = slope * x_fit + intercept return r_value, x_fit, y_fit