Source code for wailord.io.orca

# -*- coding: utf-8 -*-
"""An ad-hoc treatment of orca output files

This should implement a grammar, but currently consists of a number of utility
structures and functions to parse data from the orca output format

Example:
    See the tests for more

        $ poetry run

Some more details.

Todo:
    * Make grammar
    * Make classes
    * Test the experiments more
    * Pass a list of experiments to ignore
    * Test the ordering of variables
    * Make classes for the order
    * Handle split jobs
    * Propagate exceptions instead of passing the buck with warnings
    * Setup proper logging
    * Scrape NIST Web book for spectra, properties https://webbook.nist.gov/cgi/cbook.cgi?ID=C71432&Mask=800#Electronic-Spec
    * Document more things (e.g. SP -> Single point calculation a.k.a. energy
      calculation)
    * Return interesting things

.. _Google Python Style Guide:
   http://google.github.io/styleguide/pyguide.html

"""

import wailord.io as waio
import wailord.utils as wau

import numpy as np
import pandas as pd
import itertools as itertt

import re
import os
import textwrap
import pint
import pint_pandas
import warnings
import vg

from pathlib import Path
from functools import reduce
from collections import namedtuple, OrderedDict
from operator import itemgetter
from pandas.api.types import CategoricalDtype
from konfik import Konfik

# Pint setup
PA_ = pint_pandas.PintArray
ureg = pint.UnitRegistry()
pint_pandas.PintType.ureg = ureg
Q_ = ureg.Quantity

ureg.define("kcal_mol = kcal / 6.02214076e+23 = kcm")

inpcart = namedtuple("inpcart", "atype x y z")
orcaout = namedtuple("orcaout", "final_energy fGeom basis filename system spin theory")

ORDERED_THEORY = ["HF", "UHF", "QCISD", "CCSD", "QCISD(T)", "CCSD(T)"]

ORDERED_BASIS = [
    "STO-3G",
    "3-21G",
    "6-31G",
    "6-311G",
    "6-311G*",
    "6-311G**",
    "6-311++G**",
    "6-311++G(2d,2p)",
    "6-311++G(2df,2pd)",
    "6-311++G(3df,3pd)",
]

OUT_REGEX = {
    "cartesian_coord": re.compile(r"CARTESIAN\s*COORDINATES\s*\(ANGSTROEM\)\s*"),
    "final_single_point_e": re.compile(r"(?<=FINAL SINGLE POINT ENERGY)\s*-\d*.?\d*"),
    "basis_set": re.compile(r"Orbital\s*basis\s*set\s*information"),
    "MDCI": re.compile(r"The\s*Calculated\s*Surface\s*using\s*the\s*MDCI\s*energy\n"),
    "MDCI w/o Triples": re.compile(
        r"The Calculated Surface using the MDCI energy minus triple correction\s*"
    ),
    "Actual Energy": re.compile(r"The Calculated Surface using the 'Actual Energy'"),
    "SCF Energy": re.compile(r"The Calculated Surface using the SCF energy"),
    "energy_evals": re.compile(r"There will be\s*\d* energy evaluations"),
    "Mulliken": re.compile(r"MULLIKEN ATOMIC CHARGES"),
    "Loewdin": re.compile(r"LOEWDIN ATOMIC CHARGES"),
    "irSpectrum": re.compile(r"IR SPECTRUM"),
    "vpt2trans": re.compile(r"Fundamental transition"),
    "Vibrational Frequency": re.compile(r"VIBRATIONAL FREQUENCIES"),
}

# ------------ Refactor


[docs]def parseOut(filename, plotter=False):
    """Handles orca outputs with regex for energy and coordinates"""
    intcreg, fsp_ereg, get_basis = itemgetter(
        "cartesian_coord", "final_single_point_e", "basis_set"
    )(OUT_REGEX)
    get_spec = re.compile(r"Number\s*of\s*atoms\s*.*\s*\d*")
    with open(filename) as f:
        fInp = f.read()
        fin_energ = float(fsp_ereg.findall(fInp)[-1].split()[-1]) * ureg.hartree
        if plotter == True:
            energ = [float(x.split()[-1]) for x in fsp_ereg.findall(fInp)]
        num_species = int(get_spec.search(fInp).group(0).split()[-1])
    with open(filename) as f:
        flines = f.readlines()
        allAtoms = []
        for linum, line in enumerate(flines):
            if get_basis.search(line):
                basis = flines[linum + 1].split()[-1]
            if intcreg.search(line):
                offset = linum + 2
                for i in range(num_species):
                    p = flines[offset + i].split()
                    myAtom = inpcart(
                        atype=p[0],
                        x=float(p[1]) * ureg.angstrom,
                        y=float(p[2]) * ureg.angstrom,
                        z=float(p[3]) * ureg.angstrom,
                    )
                    allAtoms.append(myAtom)
    runinfo = getRunInfo(Path(filename).parent)
    if runinfo["spin"] == "spin_01":
        spin = "singlet"
    elif runinfo["spin"] == "spin_03":
        spin = "triplet"
    else:
        raise (NotImplementedError(f"Not yet implemented {runinfo['spin']}"))
    finGeom = []
    for i in reversed(range(1, num_species + 1)):
        finGeom.append(allAtoms[-i])
    #  Creates a dictionary of the system H num O num
    systr = pd.DataFrame(finGeom).atype.value_counts().to_dict()
    # Flattens the dictionary to a list
    listdict = list(itertt.chain.from_iterable(systr.items()))
    # Flattens the list to a single string
    liststr = "".join(map(str, listdict))
    oout = orcaout(
        final_energy=fin_energ,
        fGeom=finGeom,
        basis=basis,
        filename=filename,
        system=liststr,
        spin=spin,
        theory=runinfo["theory"],
    )
    if plotter == True:
        return oout, energ
    else:
        return oout


[docs]def get_e(orcaoutdat, basis, system):
    """
    This takes in an orcaout data frame and spits out the energy
    """
    return orcaoutdat[
        (orcaoutdat.basis.isin([basis]) & (orcaoutdat.system.isin([system])))
    ]["final_energy"].to_list()[0]


[docs]def getBL(dat, x, y, z, indi=[0, 1]):
    """Takes in a data frame of xyz coordinates and uses it to calculate the bond length"""
    v1 = np.array(
        [dat.x[indi[0]].magnitude, dat.y[indi[0]].magnitude, dat.z[indi[0]].magnitude]
    )
    v2 = np.array(
        [dat.x[indi[1]].magnitude, dat.y[indi[1]].magnitude, dat.z[indi[1]].magnitude]
    )
    return Q_(vg.euclidean_distance(v1, v2), x[indi[0]].units)


[docs]def getBA(dat, x, y, z, indi=[0, 1, 2]):
    """Takes in a data frame of xyz coordinates and uses it to generate the
    plane angle, indices are used such that the first is the relative center"""
    v1 = np.array(
        [dat.x[indi[0]].magnitude, dat.y[indi[0]].magnitude, dat.z[indi[0]].magnitude]
    )
    v2 = np.array(
        [dat.x[indi[1]].magnitude, dat.y[indi[1]].magnitude, dat.z[indi[1]].magnitude]
    )
    v3 = np.array(
        [dat.x[indi[2]].magnitude, dat.y[indi[2]].magnitude, dat.z[indi[2]].magnitude]
    )
    v12 = v2 - v1
    v13 = v3 - v1
    return Q_(vg.angle(v12, v13, units="deg"), "degrees")


[docs]def genEBASet(
    rootdir,
    deci=3,
    latex=False,
    full=False,
    order_basis=ORDERED_BASIS,
    order_theory=ORDERED_THEORY,
):
    """Takes in a Path object, and typically returns bond angles and energies.
    Optionally returns a TeX table or a full dataset with the filenames and
    geometries. Depreciate this eventually."""
    outs = []
    for root, dirs, files in os.walk(rootdir.resolve()):
        for filename in files:
            if "out" in filename and "slurm" not in filename:
                outs.append(parseOut(f"{root}/{filename}"))
    outdat = pd.DataFrame(data=outs)
    basis_type = CategoricalDtype(categories=order_basis, ordered=True)
    theory_type = CategoricalDtype(categories=order_theory, ordered=True)
    outdat["basis"] = outdat["basis"].astype(basis_type)
    outdat["theory"] = outdat["theory"].astype(theory_type)
    # print(outdat.basis[0])
    # print(pd.DataFrame(outdat.fGeom[0]))
    outdat["angle"] = outdat.fGeom.apply(
        lambda geom: getBA(
            pd.DataFrame(geom),
            pd.DataFrame(geom).x,
            pd.DataFrame(geom).y,
            pd.DataFrame(geom).z,
            [0, 1, 2],
        )
    )
    outdat.sort_values(by=["theory", "basis"], ignore_index=True, inplace=True)
    outdat.final_energy = outdat.final_energy.apply(
        lambda x: np.around(x, decimals=deci)
    )
    outdat.angle = outdat.angle.apply(lambda x: np.around(x, decimals=deci))
    if latex == True:
        return outdat.drop(["filename", "fGeom"], axis=1).to_latex(
            caption="Calculated systems at all basis sets", index=True
        )
    elif full == True:
        return outdat
    else:
        outdat.drop(["filename", "fGeom"], axis=1, inplace=True)
    return outdat


# ------------- Keep ------------------


[docs]def getRunInfo(runf):
    """Determines the runtime parameters from the output path

    The implementation uses an ordered dictionary to ensure that the path
    fragments are matched to the correct keys.

    Note:
        This will only work with wailord experiments at the moment

    Args:
        run (:obj:`Path`): Runtime output path
    Returns:
        runinf (:obj:`dict`): A simple unordered dictionary of paramters
    """
    runinf = OrderedDict(
        {"basis": None, "calc": None, "spin": None, "theory": None, "slug": None}
    )
    rfparts = runf.parts
    for num, od in enumerate(runinf, start=1):
        runinf[od] = rfparts[-num]
    runinf["basis"] = runinf["basis"].replace("PP", "++").replace("8", "*")
    runinf["theory"] = runinf["theory"].replace("_", " ")
    return dict(runinf)


[docs]def calc_htst(product, reactant, transition_state, temperature):
    """Calculates the HTST rate constant.

    This is calculated as:

    .. math::
       k^{HTST}  = \frac{∏_{i=1 … N}𝜈ᵢ}{∏_{j=1 … N-1}𝜈ⱼ^‡}e^{\frac{-𝛥E}{k_BT}}

    Where
    * :math:`𝜈ᵢ` are the final/initial state vibrational frequencies and
      :math:`𝜈ⱼ^‡` are the vibrational frequencies at the saddle point
    * :math:`𝛥E` is the difference between the energy of the state and the
      saddle point :math:`𝛥E=V_{saddle}-V_{other}`

    It is important to note that the :math:`k_f` and :math:`k_b` are relative
    measures and the rate constant is the ratio of the two. In some cases it
    might be necessary to weigh the results by the magnitude of 'c'

    Args:
        product(:obj:`orcaVis`): The product visitor class
        reactant(:obj:`orcaVis`): The reactant visitor class
        transition_state(:obj:`orcaVis`): The transition state visitor class
        temperature(`float`): The temperature

    Returns:
        kout(:obj:`rateconst`) : Returns a named tuple of forward and backward rates
    """
    prod = product.vib_freq()
    react = reactant.vib_freq()
    ts = transition_state.vib_freq()
    ppnum = prod.freq.loc[~(prod.freq <= 0)].to_numpy()
    reactnum = react.freq.loc[~(react.freq <= 0)].to_numpy()
    tsdenom = ts.freq.loc[~(ts.freq <= 0)].to_numpy()
    temp = Q_(temperature, "K")
    # FIXME: Should this be the ureg.speed_of_light instead?
    conv = Q_(1, "m/s")
    preProd = ppnum.prod() / tsdenom.prod()
    preReact = reactnum.prod() / tsdenom.prod()
    delProd = transition_state.fin_sp_e - product.fin_sp_e
    delReact = transition_state.fin_sp_e - reactant.fin_sp_e
    prodExp = -1 * (delProd / (ureg.boltzmann_constant * temp))
    reactExp = -1 * (delReact / (ureg.boltzmann_constant * temp))
    kf = (preProd * conv).to_base_units() * np.exp(prodExp.to_base_units())
    kb = (preReact * conv).to_base_units() * np.exp(reactExp.to_base_units())
    return (kf, kb)


[docs]class orcaExp:
    """The class meant to handle experiments generated with wailord.

    The general concept is that this is meant to work with the setup wailord
    generates. Remember to use `df.round()` for pretty printing!
    """

    def __init__(
        self, expfolder, deci=3, order_basis=ORDERED_BASIS, order_theory=ORDERED_THEORY
    ):
        """Initializes base parameters


        Args:
            expfolder (:obj:`Path`): Output path to the generated wailord experiment
            order_basis (:obj:`list`, optional): An ordered list for the basis
                sets. Defaults to `ORDERED_BASIS`
            order_theory (:obj:`list`, optional): An ordered list for the basis
                sets. Defaults to `ORDERED_THEORY`. Unlike `order_basis` is can
                vary significantly across experiments.
        """
        self.inpconf = None  #: Populated by `handle_exp`
        self.orclist = None  #: Populated by `handle_exp`
        self.order_basis = order_basis
        self.order_theory = order_theory
        self.handle_exp(expfolder)

    def __repr__(self):
        string = f"""
        Experiment: {self.inpconf}
        Outputs: {self.orclist}
        Ordered Theory: {self.order_theory}
        Ordered Basis: {self.order_basis}
        """
        return textwrap.dedent(string)

[docs]    def handle_exp(self, efol):
        """Populates the internal file variables from the path

        Alert:
             This is *not* meant to be called by the user!!!!

        Args:
            efol (:obj:`Path`): Output path

        """
        fnames = []
        self.inpconf = Konfik(config_path=efol / "orca.yml").config
        for root, dirs, files in os.walk(efol.resolve()):
            for filename in files:
                if "out" in filename and "slurm" not in filename:
                    fnames.append(Path(f"{root}/{filename}"))
        self.orclist = fnames
        return

[docs]    def get_final_sp_energy(self):
        """Returns a datframe of only the final single point energies

        Proxies calls to the base orcaVis class over a series of generated files

        Args:
            None

        Returns:
            pd.DataFrame: Returns a data frame of final energies
        """
        edatl = []
        for runf in self.orclist:
            runorc = orcaVis(runf).final_sp_e()
            edatl.append(runorc)
        fe = pd.DataFrame(edatl)
        basis_type = CategoricalDtype(categories=self.order_basis, ordered=True)
        theory_type = CategoricalDtype(categories=self.order_theory, ordered=True)
        fe["basis"] = fe["basis"].astype(basis_type)
        fe["theory"] = fe["theory"].astype(theory_type)
        fe.sort_values(
            by=["theory", "basis", "final_sp_energy"], ignore_index=True, inplace=True
        )
        return fe

[docs]    def get_ir_spec(self):
        """Returns a datframe of the "ir spectrum"

        Proxies calls to the base orcaVis class over a series of generated files

        Args:
            None

        Returns:
            pd.DataFrame: Returns a data frame of frequencies
        """
        vdatl = []
        for runf in self.orclist:
            runorc = orcaVis(runf).ir_spec()
            vdatl.append(runorc)
        ve = pd.concat(vdatl, axis=0)
        ve = ve.drop_duplicates()
        basis_type = CategoricalDtype(categories=self.order_basis, ordered=True)
        theory_type = CategoricalDtype(categories=self.order_theory, ordered=True)
        ve["basis"] = ve["basis"].astype(basis_type)
        ve["theory"] = ve["theory"].astype(theory_type)
        ve.sort_values(by=["theory", "basis"], ignore_index=True, inplace=True)
        return ve

[docs]    def get_vib_freq(self):
        """Returns a datframe of the vibrational modes

        Proxies calls to the base orcaVis class over a series of generated files

        Args:
            None

        Returns:
            pd.DataFrame: Returns a data frame of frequencies
        """
        vdatl = []
        for runf in self.orclist:
            runorc = orcaVis(runf).vib_freq()
            vdatl.append(runorc)
        ve = pd.concat(vdatl, axis=0)
        ve = ve.drop_duplicates()
        basis_type = CategoricalDtype(categories=self.order_basis, ordered=True)
        theory_type = CategoricalDtype(categories=self.order_theory, ordered=True)
        ve["basis"] = ve["basis"].astype(basis_type)
        ve["theory"] = ve["theory"].astype(theory_type)
        ve.sort_values(by=["theory", "basis"], ignore_index=True, inplace=True)
        return ve

[docs]    def get_vpt2_transitions(self):
        """Returns a datframe of the fundamental transitions table

        Proxies calls to the base orcaVis class over a series of generated files

        Args:
            None

        Returns:
            pd.DataFrame: Returns a data frame of frequencies
        """
        vdatl = []
        for runf in self.orclist:
            runorc = orcaVis(runf).vpt2_transitions()
            vdatl.append(runorc)
        ve = pd.concat(vdatl, axis=0)
        ve = ve.drop_duplicates()
        basis_type = CategoricalDtype(categories=self.order_basis, ordered=True)
        theory_type = CategoricalDtype(categories=self.order_theory, ordered=True)
        ve["basis"] = ve["basis"].astype(basis_type)
        ve["theory"] = ve["theory"].astype(theory_type)
        ve.sort_values(by=["theory", "basis"], ignore_index=True, inplace=True)
        return ve

[docs]    def get_energy_surface(self, etype=["Actual Energy", "SCF Energy"]):
        """Populates an energy surface dataframe

        This essentially walks over the generated set of files, and fills out
        calls to the base orcaVis class.

        Args:
            etype (:obj:`list` of :obj:`str`, optional): This is passed to the base
            `OrcaVis` class call

        Returns:
            pd.DataFrame: Returns a data frame of etype energies

        """
        if type(etype) == str:
            etype = [etype]
        edatl = []
        for runf in self.orclist:
            runsurf = orcaVis(runf).mult_energy_surface(etype=etype)
            edatl.append(runsurf)
        edat = pd.concat(edatl, axis=0)
        edat = edat.drop_duplicates()
        basis_type = CategoricalDtype(categories=self.order_basis, ordered=True)
        theory_type = CategoricalDtype(categories=self.order_theory, ordered=True)
        edat["basis"] = edat["basis"].astype(basis_type)
        edat["theory"] = edat["theory"].astype(theory_type)
        edat.sort_values(
            by=["theory", "basis", "bond_length"], ignore_index=True, inplace=True
        )
        return edat

[docs]    def get_population(self, poptype=["Mulliken", "Loewdin"], /):
        """Populates a population dataframe

        This essentially walks over the generated set of files, and fills out
        calls to the base orcaVis class.

        Args:
            poptype (:obj:`list` of :obj:`str`, optional): This is passed to the base
            `OrcaVis` class call

        Returns:
            pd.DataFrame: Returns a data frame of etype energies

        """
        if type(poptype) == str:
            poptype = [poptype]
        popdatl = []
        for runf in self.orclist:
            runsurf = orcaVis(runf).mult_population_analysis(poptype)
            popdatl.append(runsurf)
        popdat = pd.concat(popdatl, axis=0)
        popdat = popdat.drop_duplicates()
        basis_type = CategoricalDtype(categories=self.order_basis, ordered=True)
        theory_type = CategoricalDtype(categories=self.order_theory, ordered=True)
        popdat["basis"] = popdat["basis"].astype(basis_type)
        popdat["theory"] = popdat["theory"].astype(theory_type)
        popdat.sort_values(by=["theory", "basis"], ignore_index=True, inplace=True)
        return popdat

[docs]    def visit_meta(self, node, visited_children):
        """ Returns the overall output. """
        self.meta = node.text
        return node.text

[docs]    def visit_coord_block(self, node, visited_children):
        """ Makes a dict of the section (as key) and the key/value pairs. """
        cb = node.text.split("\n")
        for i, aline in enumerate(cb):
            each = aline.split()
            cb[i] = "    ".join(each)
        self.coord_block = "\n".join(cb)
        # Could have also just returned and assigned node.text
        return node.text


[docs]class orcaVis:
    """The class meant to handle ORCA output files.

    Todo:
        * Add a grammar and recursive descent later
    """

    def __init__(self, ofile):
        """Output file initialization.

        This is meant to return base objects to the experiment level class.

        Note:
            Do not include the `self` parameter in the ``Args`` section.

        Args:
            ofile (str): The output file generated by ORCA.
            eeval (int): The number of energy evaluations

        """
        self.eeval = None
        self.ofile = ofile
        self.runinfo = getRunInfo(self.ofile.parent)
        self.fin_sp_e = None
        self.get_evals(self.ofile)
        self.get_final_e()

    def __repr__(self):
        return f"{self.ofile}"

[docs]    def get_evals(self, ofile):
        with open(self.ofile) as of:
            flines = of.readlines()
            for line in flines:
                if OUT_REGEX["energy_evals"].search(line):
                    self.eeval = int(line.split()[3])
        return

[docs]    def get_final_e(self, dat=False):
        with open(self.ofile) as of:
            fInp = of.read()
            try:
                self.fin_sp_e = (
                    float(
                        OUT_REGEX["final_single_point_e"].findall(fInp)[-1].split()[-1]
                    )
                    * ureg.hartree
                )
            except:
                raise (
                    ValueError(f"Final single point energy not found for {self.ofile}")
                )
        pass

[docs]    def final_sp_e(self):
        erow = self.runinfo
        erow["final_sp_energy"] = self.fin_sp_e.m
        erow["unit"] = self.fin_sp_e.u
        return erow

[docs]    def mult_energy_surface(
        self,
        etype=["Actual Energy", "MDCI", "MDCI w/o Triples", "SCF Energy"],
        npoints=None,
    ):
        """Multiple Energy surface dataframe generator

        This is a helper function to obtain a dataframe which contains multiple
        energy surfaces. The implementation leverages the `reduce` function from
        `functools` to merge a list of dataframes generated from the
        `single_energy_surface` calls.

        Args:
            etype (str,optional): The type of calculated energy surface to
                return. Defaults to `["Actual Energy", "MDCI", "MDCI w/o Triples",
                "SCF Energy"]` but can be any valid subset of the same.
            npoints (int,optional): The number of points over which a scan has
                taken place. Defaults to the number of evaluations calculated in
                the output file.

        Returns:
            pd.DataFrame: Returns a data frame of bond_length and energies

        .. _MDCI:
            https://www.its.hku.hk/services/research/hpc/software/orca

        """
        if isinstance(etype, str) or len(etype) == 1:
            if isinstance(etype, list):
                etype = etype[0]
            single = self.single_energy_surface(
                etype=etype
            )  #: Short circuit if single type is requested
            for key in self.runinfo.keys():
                single[key] = self.runinfo[key]
            return single
        elist = []
        for et in etype:
            runsurf = self.single_energy_surface(etype=et)
            elist.append(runsurf)
        eDat_all = reduce(lambda df1, df2: pd.merge(df1, df2, on="bond_length"), elist)
        for key in self.runinfo.keys():
            eDat_all[key] = self.runinfo[key]
        return eDat_all

[docs]    def single_energy_surface(self, etype="Actual Energy", npoints=None):
        """Single energy surface dataframe generator

        For say, QCISD(T), this is essentially the same as a QCISD calculation.

        Note:
            `MDCI`_ types are meant to work with single reference correlation
            methods

        Args:
            etype (str,optional): The type of calculated energy surface to
            return. Defaults to 'Actual Energy' and can be any of `["Actual Energy", "MDCI", "MDCI w/o Triples", "SCF Energy"]`
            npoints (int,optional): The number of points over which a scan has
                taken place. Defaults to the number of evaluations calculated in
                the output file.

        Returns:
            pd.DataFrame: Returns a data frame of energy surfaces

        .. _MDCI:
            https://www.its.hku.hk/services/research/hpc/software/orca

        """
        if etype not in OUT_REGEX:
            raise (NotImplementedError(f"{etype} has not been implemented yet"))
        if npoints == None:
            npoints = self.eeval
        xaxis = []
        yaxis = []
        sregexp = OUT_REGEX[etype]
        with open(self.ofile) as of:
            flines = of.readlines()
            for lnum, line in enumerate(flines):
                if sregexp.search(line):
                    offset = lnum + 1
                    for i in range(npoints):
                        x, y = flines[offset + i].split()
                        xaxis.append(x)
                        yaxis.append(y)
        edat = pd.DataFrame(
            data=zip(xaxis, yaxis), columns=["bond_length", etype], dtype="float64"
        )
        if edat.empty:
            raise (
                ValueError(f"{etype} surface not found for {self.runinfo['theory']}")
            )
        return edat

[docs]    def vib_freq(self):
        """Get the vibrational frequencies, and fails if there are more than one
        imaginary frequency"""
        sregexp = OUT_REGEX["Vibrational Frequency"]
        vline = namedtuple("vline", "Mode freq imaginary")
        accumulate = []
        with open(self.ofile) as of:
            flines = of.readlines()
            for lnum, line in enumerate(flines):
                if sregexp.search(line):
                    offset = lnum + 5
                    i = 0
                    while flines[offset + i] != "\n":
                        raw = flines[offset + i].split()
                        if len(raw) == 3:
                            v = vline(
                                Mode=int(raw[0].replace(":", "")),
                                freq=float(raw[1]),
                                imaginary=False,
                            )
                        elif len(raw) == 5:
                            v = vline(
                                Mode=int(raw[0].replace(":", "")),
                                freq=float(raw[1]),
                                imaginary=True,
                            )
                        accumulate.append(v)
                        i = i + 1
        vdat = pd.DataFrame(accumulate)
        vdat["freq"] = vdat["freq"].astype("pint[cm_1]")
        # TODO: Add experiment layer
        # TODO: Error if more than one imaginary
        # Check if greater 100 then it is not numerical
        # if vdat.empty:
        #     raise (
        #         ValueError(
        #             f"Spectra not found for {self.runinfo['theory']}, did you run FREQ?"
        #         )
        #     )
        # elif vdat.imaginary.value_counts()[1] > 1:
        #     warnings.warn(
        #         f"More than one imaginary mode, you did not find a saddle point",
        #         UserWarning,
        #     )
        # elif vdat.imaginary.value_counts()[1] == 0:
        #     raise (
        #         ValueError(
        #             "No imaginary frequencies found, check the geometry of reactant and product configurations"
        #         )
        #     )
        return vdat

[docs]    def vpt2_transitions(self):
        """Grabs the fundamental transition analysis from a VPT2 calculation"""
        sregexp = OUT_REGEX["vpt2trans"]
        vline = namedtuple("vline", "Mode harmonic_freq vpt2_freq freq_diff")
        accumulate = []
        with open(self.ofile) as of:
            flines = of.readlines()
            for lnum, line in enumerate(flines):
                if sregexp.search(line):
                    offset = lnum + 4
                    i = 0
                    while "---" not in flines[offset + i]:
                        raw = flines[offset + i].split()
                        v = vline(
                            Mode=int(raw[0]),
                            harmonic_freq=float(raw[1]),
                            vpt2_freq=float(raw[2]),
                            freq_diff=float(raw[3]),
                        )
                        accumulate.append(v)
                        i = i + 1
        vdat = pd.DataFrame(accumulate)
        vdat["harmonic_freq"] = vdat["harmonic_freq"].astype("pint[cm_1]")
        vdat["vpt2_freq"] = vdat["vpt2_freq"].astype("pint[cm_1]")
        vdat["freq_diff"] = vdat["freq_diff"].astype("pint[cm_1]")
        if vdat.empty:
            raise (
                ValueError(
                    f"Data not found for {self.runinfo['theory']}, did you run VPT2?"
                )
            )
        else:
            for key in self.runinfo.keys():
                vdat[key] = self.runinfo[key]
        return vdat

[docs]    def ir_spec(self):
        """Grabs the non-ZPE corrected IR Spectra and the dipole derivatives for
        intensities"""
        sregexp = OUT_REGEX["irSpectrum"]
        vline = namedtuple("vline", "Mode freq T2 TX TY TZ")
        accumulate = []
        with open(self.ofile) as of:
            flines = of.readlines()
            for lnum, line in enumerate(flines):
                if sregexp.search(line):
                    offset = lnum + 5
                    i = 0
                    while flines[offset + i] != "\n":
                        raw = flines[offset + i].split()
                        raw = [i for i in raw if i != ":" if i != "(" if i != ")"]
                        v = vline(
                            Mode=int(raw[0].replace(":", "")),
                            freq=float(raw[1]),
                            T2=float(raw[2]),
                            TX=float(raw[3].replace("(", "")),
                            TY=float(raw[4]),
                            TZ=float(raw[5].replace(")", "")),
                        )
                        accumulate.append(v)
                        i = i + 1
        vdat = pd.DataFrame(accumulate)
        vdat["T2"] = vdat["T2"].astype("pint[km/mol]")
        vdat["freq"] = vdat["freq"].astype("pint[cm_1]")
        if vdat.empty:
            raise (
                ValueError(
                    f"Spectra not found for {self.runinfo['theory']}, did you run FREQ?"
                )
            )
        else:
            for key in self.runinfo.keys():
                vdat[key] = self.runinfo[key]
        return vdat

[docs]    def single_population_analysis(self, poptype="Mulliken", /):
        """Single population analysis dataframe generator

        Args:
            poptype (str,optional): The type of population analysis to
            return. Defaults to 'Mulliken'.

        Returns:
            pd.DataFrame: Returns a data frame of the population analysis

        """
        if poptype not in OUT_REGEX:
            raise (NotImplementedError(f"{poptype} has not been implemented yet"))
        sregexp = OUT_REGEX[poptype]
        chargeline = namedtuple("chargeline", "anum atype pcharge")
        fulline = namedtuple("fulline", "anum atype pcharge pspin")
        accumulate = []
        with open(self.ofile) as of:
            flines = of.readlines()
            for lnum, line in enumerate(flines):
                if sregexp.search(line):
                    offset = lnum + 2
                    i = 0
                    while (
                        "Sum" not in flines[offset + i]
                        and "--" not in flines[offset + i + 1]
                    ):
                        raw = flines[offset + i].split()
                        if "SPIN" in line:
                            c = fulline(
                                anum=raw[0],
                                atype=raw[1],
                                pcharge=float(raw[-2]),
                                pspin=float(raw[-1]),
                            )
                        else:
                            c = chargeline(
                                anum=raw[0],
                                atype=raw[1],
                                pcharge=float(raw[-1]),
                            )
                        accumulate.append(c)
                        i = i + 1
        popdat = pd.DataFrame(accumulate)
        step = popdat.anum.count() / popdat.anum.nunique()
        popdat["step"] = np.asarray(
            np.repeat(np.arange(1, step + 1), popdat.anum.nunique()), dtype=int
        )
        popdat["population"] = poptype
        if popdat.empty:
            raise (ValueError(f"{poptype} not found for {self.runinfo['theory']}"))
        return popdat

[docs]    def mult_population_analysis(self, poptype=["Mulliken", "Loewdin"], /):
        """Multiple population analysis dataframe generator

        This is a helper function to obtain a dataframe which contains multiple
        population analysis outputs. The implementation is similar to the energy surface helper.

        Args:
            poptype (str,optional): The type of calculated energy surface to
                return.

        Returns:
            pd.DataFrame: Returns a data frame of population analysis types
        """
        if isinstance(poptype, str) or len(poptype) == 1:
            if isinstance(poptype, list):
                poptype = poptype[0]
            single = self.single_population_surface(poptype)
            for key in self.runinfo.keys():
                single[key] = self.runinfo[key]
            return single
        poplist = []
        for pt in poptype:
            runsurf = self.single_population_analysis(pt)
            poplist.append(runsurf)
        popdat = reduce(lambda df1, df2: pd.concat([df1, df2]), poplist)
        for key in self.runinfo.keys():
            popdat[key] = self.runinfo[key]
        return popdat