Source code for subproptools.qtaim_extract

"""qtaimExtract
Tools for extracting data from sum files(and CC from agpviz), and calculating substituent properties specifically
"""
# pylint:disable=too-many-lines
import ast
import math  # sqrt

import numpy as np  # arrays
import pandas as pd  # data frames

# import os  # file system stuff


def _search_str(
    linesObj: list[str], word: str, searchStart: int = 0, ignore: int = 0
) -> int:
    """Given lines of file, return line that word is on. or -1 if not found"""
    wordLine = -1  # will return -1 if string not found
    for ln_num, line in enumerate(linesObj):  # iterate over lines
        if line.find(word) > -1 and linesObj.index(line) >= searchStart:
            if ignore == 0:
                wordLine = ln_num
                break
            ignore = ignore - 1
    return wordLine


def _get_li_or_di(atom_label, lab, di_table, li_table):
    """Returns LI(atom_label) if atom_label==lab or DI(atom_label,lab)/2 if atom_label!=lab"""
    if lab == atom_label:
        li_id = li_table.index[li_table["Atom"] == lab][0]
        res = float(li_table.at[li_id, "LI(A)"])
    else:
        # check for both cases, where atom_label is in Atom A or where it is in Atom B
        atom_in_A = (di_table["Atom A"] == atom_label) & (di_table["Atom B"] == lab)
        atom_in_B = (di_table["Atom B"] == atom_label) & (di_table["Atom A"] == lab)
        di_id = di_table.index[atom_in_A | atom_in_B][0]
        res = (
            float(di_table.at[di_id, "DI(A,B)"]) / 2
        )  # divided by 2 from Matta's definition of LDM
    return res


def _get_ldm_column(atom_label, di_table, li_table):
    """Gets a column of the LDM given the atom label of the column"""
    out_list = []
    atom_labels = list(li_table["Atom"])
    out_list = [
        _get_li_or_di(atom_label, lab, di_table, li_table) for lab in atom_labels
    ]
    return out_list


def get_ldm(data: list[str]) -> pd.DataFrame:
    """Given lines of sum file, returns the LDM as defined by Matta

    Args:
        data: lines of a .sum file

    Returns:
        pd.DataFrame of the LDM. LIs on diagonals, DI/2 on off diagonals"""
    di_table = _get_di_table(data)
    li_table = _get_table(
        data=data,
        tableHeader="N(A)             LI(A)            %Loc(A)          DI(A,A')/2       %Deloc(A,A')",
    )
    atom_labels = li_table["Atom"]
    out_frame = pd.DataFrame(index=atom_labels)
    for atom_label in atom_labels:
        out_frame[atom_label] = _get_ldm_column(atom_label, di_table, li_table)
    return out_frame


def _get_di_table(
    data: list[str],
) -> pd.DataFrame:
    """Given lines of sum file, returns the table containing DI information."""
    tableStart = (
        _search_str(
            data,
            "2*D2(A,B)          DI(A,B)          %Deloc(A,B)       %Deloc(B,A)",
            searchStart=0,
        )
        + 2
    )  # data starts 2 lines after header
    tableEnd = (
        _search_str(data, "Areas of Interatomic Surfaces:", tableStart) - 2
    )  # -2 since there is a 2 line gap
    tableData = data[tableStart:tableEnd]
    headerNames = ["Atom A", "Atom B"]
    for (
        colName
    ) in "2*D2(A,B)          DI(A,B)          %Deloc(A,B)       %Deloc(B,A)".split():
        headerNames.append(colName)
    table = pd.DataFrame(columns=headerNames)
    # Add each row of table to Data Frame
    for i, line in enumerate(tableData):
        table.loc[i] = line.split()[0:6]
    return table


def _get_table(
    data: list[str], tableHeader: str, ignored: int = 0, endString: str = "Total"
) -> pd.DataFrame:
    """Given lines of sum file and table header, return pandas dataframe of requested table

    Args:
        data: lines of sum file
        tableHeader: string containing non-atom columns of table to be found
        ignored: - find ignored+1th occurence
        endString: the string found 2 lines after the table data ends

    Returns:
        dataframe of table matching header

    """
    tableStart = (
        _search_str(data, tableHeader, searchStart=0, ignore=ignored) + 2
    )  # data starts 2 lines after header
    tableEnd = (
        _search_str(data, endString, tableStart) - 1
    )  # only deduct 1, because next line will get up to but not including that line
    tableData = data[tableStart:tableEnd]  # get data
    # Include atom here, as sumfile header is Atom A and the whitespace there will mess with how code is writting
    headerNames = ["Atom"]
    for colName in tableHeader.split():
        headerNames.append(colName)
    table = pd.DataFrame(columns=headerNames)
    n_col = len(headerNames)
    for i, line in enumerate(tableData):
        split_line = line.split()
        if "Vol" in tableHeader and len(split_line) < n_col:
            split_line.insert(1, 0.0)
        table.loc[i] = split_line

    return table


def _get_bcp_block(data, atPair=["C1", "H2"]):  # pylint:disable=dangerous-default-value
    """Given sum file lines and pair of atoms, return lines in file corresponding to that BCP."""
    bcpStart = _search_str(data, word=f"Type = (3,-1) BCP {atPair[0]} {atPair[1]}") - 1
    if bcpStart == -2:
        bcpStart = (
            _search_str(data, word=f"Type = (3,-1) BCP {atPair[1]} {atPair[0]}") - 1
        )
    if bcpStart == -2:
        raise ValueError(
            f"BCP between {atPair[0]} and {atPair[1]} not found in {data[12]}"
        )
    bcpEnd = bcpStart + 34
    bcpBlock = data[bcpStart:bcpEnd]
    return bcpBlock  # return the lines of the BCP data



[docs]
def get_sub_di(data: list[str], subAtomLabels=list[str]) -> float:
    """Given lines of sum file and labels of atoms in substituent, return DI between substituent and rest of molecule

    Args:
        data: lines of .sum file
        subAtomLabels: list of atoms in the substituent, e.g.["C1","C2"]

    Returns:
        DI between substituent atoms and rest of molecule

    """
    diTable = _get_di_table(data)
    diTable = diTable.drop(["2*D2(A,B)", "%Deloc(A,B)", "%Deloc(B,A)"], axis=1)
    diClass = []

    diTable = diTable.astype({"DI(A,B)": float})
    for ind in diTable.index:
        if any(x == diTable.at[ind, "Atom A"] for x in subAtomLabels) and any(
            x == diTable.at[ind, "Atom B"] for x in subAtomLabels
        ):
            diClass.append("Substituent-Substituent")
        elif any(x == diTable.at[ind, "Atom A"] for x in subAtomLabels) or any(
            x == diTable.at[ind, "Atom B"] for x in subAtomLabels
        ):
            diClass.append("Substituent-Substrate")
        else:
            diClass.append("Substrate-Substrate")
    diTable.insert(2, "Interaction", diClass, True)
    return sum(
        diTable.loc[diTable["Interaction"] == "Substituent-Substrate", "DI(A,B)"]
    )




[docs]
def get_bcp_properties(
    data: list[str], atPair: list[str] = ["C1", "H2"]
):  # pylint:disable=dangerous-default-value
    """Get BCP Properties for BCP between two atoms in list

    Args:
        data: lines of a .sum file
        atPair: list of atom labels for which to find BCP, e.g. ['C1','H2']

    Returns:
        dictionary containing properties of atPair bcp with keys
            'xyz' (np array of xyz coords),
            'rho',
            lambda1,
            lambda2,
            lambda3,
            DelSqRho,
            Ellipticity,
            V,
            G,
            H

    """
    bcpBlock = _get_bcp_block(data, atPair)
    bcpDict = {}
    for line in bcpBlock:  # iterate over lines
        splitLine = (
            line.split()
        )  # split the line into individual words based on whitespace
        if "Coords" in splitLine:  # if line contains the coordinates, get coordinates
            bcpDict.update(
                {
                    "xyz": np.array(
                        [float(splitLine[4]), float(splitLine[5]), float(splitLine[6])]
                    )
                }
            )
        elif "Rho" in splitLine:  # get density at BCP
            bcpDict.update({"rho": [float(splitLine[2])]})
        elif "HessRho_EigVals" in splitLine:  # get lambda_i
            bcpDict.update({"lambda1": [float(splitLine[2])]})
            bcpDict.update({"lambda2": [float(splitLine[3])]})
            bcpDict.update({"lambda3": [float(splitLine[4])]})
        elif "DelSqRho" in splitLine:  # get DelSqRho
            bcpDict.update({"DelSqRho": [float(splitLine[2])]})
        elif "Ellipticity" in splitLine:  # get ellipticity
            bcpDict.update({"Ellipticity": [float(splitLine[3])]})
        elif "V" in splitLine:  # get V
            bcpDict.update({"V": [float(splitLine[2])]})
        elif "G" in splitLine:  # get G
            bcpDict.update({"G": [float(splitLine[2])]})
    bcpDict.update(
        {"H": [bcpDict["V"][0] + bcpDict["G"][0]]}
    )  # get total energy density
    return bcpDict




[docs]
def get_atomic_props(data: list[str]) -> dict:
    """Returns a dictionary of atomic properties

    Args:
        data: the string lines of a .sum file

    Returns:
        A dictionary of atomic properties with one key for each atom label e.g. 'C1'
        Each of those dictionaries has the following keys:
        xyz, q, K, K_Scaled, Mu_Intra_X, Mu_Intra_Y, Mu_Intra_Z, Mu_Bond_X, Mu_Bond_Y, Mu_Bond_Z,
        Mu_X, Mu_Y, Mu_Z, ``|Mu_Intra|``, ``|Mu_Bond|``, ``|Mu|``, Q_XX, Q_XY, Q_XZ, Q_YY, Q_YZ,
        Q_ZZ, R+2, R+1, Vol, quadContrib

    Note:
        Volume is calculated at the 0.001 au isosurface
        quadContrib is the atomic contribution to the molecular
        quadrupole moment from Laidig TODO add citation

    """
    xyzFrame = _get_table(
        data,
        "Charge                X                  Y                  Z",
        endString="Some Atomic Properties:",
    )
    eFrame = _get_table(
        data,
        "q(A)              L(A)              K(A)          K_Scaled(A)      |Mu_Intra(A)|",
    )
    muXFrame = _get_table(data, "Mu_Intra_X(A)     Mu_Bond_X(A)        Mu_X(A)")
    muYFrame = _get_table(data, "Mu_Intra_Y(A)     Mu_Bond_Y(A)        Mu_Y(A)")
    muZFrame = _get_table(data, "Mu_Intra_Z(A)     Mu_Bond_Z(A)        Mu_Z(A)")
    muMagFrame = _get_table(data, "|Mu_Intra(A)|     |Mu_Bond(A)|        |Mu(A)|")
    quadFrame = _get_table(
        data,
        "Q_XX(A)           Q_XY(A)           Q_XZ(A)           Q_YY(A)           Q_YZ(A)           Q_ZZ(A)",
        endString="Eigenvalues and Eigenvectors of Atomic Traceless Quadrupole Moment Tensors",
    )
    radFrame = _get_table(
        data,
        "R-2(A)            R-1(A)            R0(A)             R+1(A)            R+2(A)",
        endString="Atomic Radial Distortion Moments",
    )
    volFrame = _get_table(
        data,
        "Area(A)           Vol(A)          N(Vol(A))      N(Vol(A))/Vol(A)     %N(Vol(A))",
        ignored=1,
    )
    allatomDict = {}  # initialize empty dictionary to store dictionaries of the atoms
    for atom in eFrame["Atom"]:
        atomDict = {}  # create a dictionary for each atom
        atomDict.update(
            {
                "xyz": np.array(
                    [
                        float(xyzFrame[xyzFrame["Atom"] == atom]["X"].iloc[0]),
                        float(xyzFrame[xyzFrame["Atom"] == atom]["Y"].iloc[0]),
                        float(xyzFrame[xyzFrame["Atom"] == atom]["Z"].iloc[0]),
                    ]
                )
            }
        )
        atomDict.update({"q": [float(eFrame[eFrame["Atom"] == atom]["q(A)"].iloc[0])]})
        atomDict.update({"K": [float(eFrame[eFrame["Atom"] == atom]["K(A)"].iloc[0])]})
        atomDict.update(
            {"K_Scaled": [float(eFrame[eFrame["Atom"] == atom]["K_Scaled(A)"].iloc[0])]}
        )
        atomDict.update(
            {
                "Mu_Intra_X": [
                    float(muXFrame[muXFrame["Atom"] == atom]["Mu_Intra_X(A)"].iloc[0])
                ]
            }
        )
        atomDict.update(
            {
                "Mu_Intra_Y": [
                    float(muYFrame[muYFrame["Atom"] == atom]["Mu_Intra_Y(A)"].iloc[0])
                ]
            }
        )
        atomDict.update(
            {
                "Mu_Intra_Z": [
                    float(muZFrame[muZFrame["Atom"] == atom]["Mu_Intra_Z(A)"].iloc[0])
                ]
            }
        )
        atomDict.update(
            {
                "Mu_Bond_X": [
                    float(muXFrame[muXFrame["Atom"] == atom]["Mu_Bond_X(A)"].iloc[0])
                ]
            }
        )
        atomDict.update(
            {
                "Mu_Bond_Y": [
                    float(muYFrame[muYFrame["Atom"] == atom]["Mu_Bond_Y(A)"].iloc[0])
                ]
            }
        )
        atomDict.update(
            {
                "Mu_Bond_Z": [
                    float(muZFrame[muZFrame["Atom"] == atom]["Mu_Bond_Z(A)"].iloc[0])
                ]
            }
        )
        atomDict.update(
            {"Mu_X": [float(muXFrame[muXFrame["Atom"] == atom]["Mu_X(A)"].iloc[0])]}
        )
        atomDict.update(
            {"Mu_Y": [float(muYFrame[muYFrame["Atom"] == atom]["Mu_Y(A)"].iloc[0])]}
        )
        atomDict.update(
            {"Mu_Z": [float(muZFrame[muZFrame["Atom"] == atom]["Mu_Z(A)"].iloc[0])]}
        )
        atomDict.update(
            {
                "|Mu_Intra|": [
                    float(
                        muMagFrame[muMagFrame["Atom"] == atom]["|Mu_Intra(A)|"].iloc[0]
                    )
                ]
            }
        )
        atomDict.update(
            {
                "|Mu_Bond|": [
                    float(
                        muMagFrame[muMagFrame["Atom"] == atom]["|Mu_Bond(A)|"].iloc[0]
                    )
                ]
            }
        )
        atomDict.update(
            {"|Mu|": [float(muMagFrame[muMagFrame["Atom"] == atom]["|Mu(A)|"].iloc[0])]}
        )
        atomDict.update(
            {"Q_XX": [float(quadFrame[quadFrame["Atom"] == atom]["Q_XX(A)"].iloc[0])]}
        )
        atomDict.update(
            {"Q_XY": [float(quadFrame[quadFrame["Atom"] == atom]["Q_XY(A)"].iloc[0])]}
        )
        atomDict.update(
            {"Q_XZ": [float(quadFrame[quadFrame["Atom"] == atom]["Q_XZ(A)"].iloc[0])]}
        )
        atomDict.update(
            {"Q_YY": [float(quadFrame[quadFrame["Atom"] == atom]["Q_YY(A)"].iloc[0])]}
        )
        atomDict.update(
            {"Q_YZ": [float(quadFrame[quadFrame["Atom"] == atom]["Q_YZ(A)"].iloc[0])]}
        )
        atomDict.update(
            {"Q_ZZ": [float(quadFrame[quadFrame["Atom"] == atom]["Q_ZZ(A)"].iloc[0])]}
        )
        atomDict.update(
            {"R+2": [float(radFrame[radFrame["Atom"] == atom]["R+2(A)"].iloc[0])]}
        )
        atomDict.update(
            {"R+1": [float(radFrame[radFrame["Atom"] == atom]["R+1(A)"].iloc[0])]}
        )
        atomDict.update(
            {"Vol": [float(volFrame[volFrame["Atom"] == atom]["Vol(A)"].iloc[0])]}
        )
        atomDict.update({"quadContrib": _get_atomic_quad_contrib(atomDict)})
        allatomDict.update(
            {atom: atomDict}
        )  # add the atomic dictionary to the dictionary of dictionaries
    return allatomDict



def _get_dist(coordListA, coordListB):
    """Return cartesian distance between two 3 dimensional np arrays."""
    dist = math.sqrt(
        (coordListA[0] - coordListB[0]) ** 2
        + (coordListA[1] - coordListB[1]) ** 2
        + (coordListA[2] - coordListB[2]) ** 2
    )
    return dist


def _is_bonded_cc(ccXYZ, atomDict, originAtom):
    """Checks if a charge concentration is a bonded charge concentration"""
    atom_labels = atomDict.keys()
    on_a_line = False
    for a_1 in atom_labels:
        if a_1 != originAtom:
            if _is_on_line(atomDict[a_1]["xyz"], atomDict[originAtom]["xyz"], ccXYZ):
                on_a_line = True
                break

    return on_a_line


def identify_vscc(
    multiccDict: dict, atomDict: dict, originAtom: str, thresh: float = 0.7
) -> dict:
    """Filter charge concentrations for an atom to only return Valence Shell Charge Concentrations

    Args:
        multiccDict: dictionary of cc properties for all ccs of an atom
        atomDict: get_atomic_props return object
        thresh: distance between inner shell and outer shell ccs
        originAtom: label of atom in molecule, e.g. 'C1'
        #(eg inner shell charge concentration is more than 0.7 au closer to nuclei than VSCC)
    Returns:
        subset of multiccDict correspondng to valence shell charge concentrations

    Note:
        VSCCs identified by: not being on nuclei, not being on line between atoms, and being the outermost CCs
        The default threshold is 0.7 au from a preliminary data analysis. For Si/P/S/Cl, this was found to be
        greater than the distance between inner and outer charge concentrations
    """

    vsccDict = {}  # initialize empty dictionary for cc
    nucDistList = []  # empty list to store distances
    potentialccList = (
        []
    )  # empty list to store keys for potential VSCC after eliminating some criteria
    for cc in multiccDict:  # for each cc
        if (
            multiccDict[cc]["distFromNuc"] > 0.1
        ):  # this will filter out the innermost cc that are on the nuclei
            ccXYZ = multiccDict[cc]["xyz"]  # get the xyz coords of the cc
            # nucPair = _find_closest_nuclei(ccXYZ,atomDict) #find the two closest atoms to the cc
            # if it is a bonded cc, it will be the nuclei to which it is bonded
            # check if the cc is on the bond between those two nuclei
            isBonded = _is_bonded_cc(ccXYZ, atomDict, originAtom)
            if (
                not isBonded
            ):  # if it is not on the line, it is potentially a VSCC, store it, and its distance
                nucDistList.append(multiccDict[cc]["distFromNuc"])
                potentialccList.append(cc)
    # at this point, non-bonded, non-core CC remain, so just need to check which CC are the outermost
    # do this by comparing to maximum distance from nuclei
    if len(nucDistList) > 0:
        outerShellDist = max(nucDistList)
    for cc in potentialccList:
        if (
            abs(
                multiccDict[cc]["distFromNuc"]
                - outerShellDist  # pylint:disable=used-before-assignment
            )
            < thresh
        ):
            # for given cc, if close (within thresh) to outermost, it is a VSCC, store it as such
            # So, going up to 3rd row for now. May work for 4th but I haven't checked
            # C/O/F easy - any not core not bonded should be VSCC
            # P/S/Si harder
            # in example data inner~0.25-0.3, outer ~ 1.2-1.3
            # arbitrary default threshold seems like 0.7 will work
            vsccDict.update({cc: multiccDict[cc]})
    return vsccDict



[docs]
def get_atom_vscc(
    filename, atomLabel: str, atomicProps: dict, is_lines_data: bool = False
):
    """Returns a dicitonary of the properties of VSCC for the atom

    Args:
        filename: name of sum file without .sum extension OR lines of atomLabel's agpviz file if is_lines_data is True
        atomLabel: label of atom in molecule, e.g. 'C1'
        atomicProps: return value of a get_atomic_props function
        is_lines_data: is filename actually the lines of an agpviz file instead of filename

    Returns:
        dictionary of dictionaries of all VSCC concentrations for atomLabel.
            Keys are integer 1,2...n where n is number of cc
            Each of those dict[key] is a dictionary as well containing cc properties

    """
    all_cc_dict = get_cc_props(filename, atomLabel, is_lines_data)
    vscc_dict = identify_vscc(all_cc_dict, atomicProps, atomLabel)
    return vscc_dict



def _is_on_line(lineStartList, lineEndList, pointToCheckList, epsilon=0.3):
    """Given 3d coords for start of line, end of line and point to check, sees if pointToCheck is on line"""
    # line connecting atoms: (x1 + t(x2-x1),y1 + t(y2-y1),z1 + t(z2-z1))
    # to reconstruct - create equations of lines. (pointToCheck-pointWitht) and (lineStart-pointWitht)
    # dot product of those vectors should be 0(closest point is. 90 degrees)
    # rearrange to solve for t
    st_end_mag = math.sqrt(
        ((lineEndList[2] - lineStartList[2]) ** 2)
        + ((lineEndList[1] - lineStartList[1]) ** 2)
        + ((lineEndList[0] - lineStartList[0]) ** 2)
    )
    t = (
        (pointToCheckList[0] - lineStartList[0]) * (lineEndList[0] - lineStartList[0])
        + (pointToCheckList[1] - lineStartList[1]) * (lineEndList[1] - lineStartList[1])
        + (pointToCheckList[2] - lineStartList[2]) * (lineEndList[2] - lineStartList[2])
    ) / st_end_mag
    perpPoint = np.array(
        [
            lineStartList[0] + t * (lineEndList[0] - lineStartList[0]) / st_end_mag,
            lineStartList[1] + t * (lineEndList[1] - lineStartList[1]) / st_end_mag,
            lineStartList[2] + t * (lineEndList[2] - lineStartList[2]) / st_end_mag,
        ]
    )

    # if t is between the end points of the line, and the pointToCheck is within epsilon of that point, it is a bonded cc

    if 0 < t < st_end_mag and _get_dist(pointToCheckList, perpPoint) < epsilon:
        return True
    return False



[docs]
def get_cc_props(filename, atomLabel: str, is_lines_data: bool = False) -> dict:
    """Gets all charge concentration properties for an atom

    Args:
        filename: sum file name:str with no extension or lines of the desired agpviz file as list[str]
        atomLabel: label of atom that we wish to find VSCC for, e.g. 'N3'
        is_lines_data: True if filename is lines of agpviz file, false otherwise

    Returns:
        Dictionary with one nested dictinoary for each VSCC found
        {1: {VSCC1 props},
        2::{VSCC2 props}
        ,...}
        Sub-dictionary keys include: xyz, rho, delsqrho, distFromNuc

    """
    if not is_lines_data:
        # example dir name SubH_CCF-ReorPosY-B3LYP-def2-TZVPPD-Field_atomicfiles
        pathToSubdir = (
            filename + "_atomicfiles" + "/"
        )  # removed os.getcwd() + "/" from start
        with open(
            pathToSubdir + atomLabel.lower() + ".agpviz", encoding="utf-8"
        ) as atFile:
            atData = atFile.readlines()

    else:
        atData = filename
    allccDict = {}
    alldictcounter = 1  # counter since the key will be numeric

    for lnNum, line in enumerate(atData):
        if "Type = (3,+3)" in line:  # if we're looking at CC
            oneccDict = {}  # create empty dict for storing properties for one cc
            xyzSplit = atData[
                lnNum + 1
            ].split()  # line after label is xyz, split it and store xyz
            oneccDict.update(
                {
                    "xyz": np.array(
                        [float(xyzSplit[2]), float(xyzSplit[3]), float(xyzSplit[4])]
                    )
                }
            )
            # next line is distance from nuc, split and store
            distFromNucSplit = atData[lnNum + 2].split()
            oneccDict.update({"distFromNuc": float(distFromNucSplit[2])})
            # next is rho, then delsqrho - split and store those
            rhoSplit = atData[lnNum + 3].split()
            oneccDict.update({"rho": float(rhoSplit[2])})
            delsqrhoSplit = atData[lnNum + 4].split()
            oneccDict.update({"delsqrho": float(delsqrhoSplit[2])})
            allccDict.update({alldictcounter: oneccDict})
            alldictcounter += 1
    return allccDict



def _get_atomic_quad_contrib(atomDict):
    """Given atomDict from get_atomic_props, create dictionary containing atomic quadupole contributions

    Note: formula used from paper by Laidig in 1991
    https://www.sciencedirect.com/science/article/abs/pii/000926149180247U
    """
    secondMomentDict = {}
    secondMomentDict.update(
        {
            "Qxx": atomDict["q"][0] * (atomDict["xyz"][0] ** 2)
            + (atomDict["Q_XX"][0] + atomDict["R+2"][0]) / 3
            + atomDict["xyz"][0] * atomDict["Mu_Intra_X"][0]
            + atomDict["xyz"][0] * atomDict["Mu_Intra_X"][0]
        }
    )
    secondMomentDict.update(
        {
            "Qxy": atomDict["q"][0] * atomDict["xyz"][0] * atomDict["xyz"][1]
            + atomDict["Q_XY"][0] / 3
            + atomDict["xyz"][1] * atomDict["Mu_Intra_X"][0]
            + atomDict["xyz"][0] * atomDict["Mu_Intra_Y"][0]
        }
    )
    secondMomentDict.update(
        {
            "Qxz": atomDict["q"][0] * atomDict["xyz"][0] * atomDict["xyz"][2]
            + atomDict["Q_XZ"][0] / 3
            + atomDict["xyz"][2] * atomDict["Mu_Intra_X"][0]
            + atomDict["xyz"][0] * atomDict["Mu_Intra_Z"][0]
        }
    )
    secondMomentDict.update(
        {
            "Qyy": atomDict["q"][0] * (atomDict["xyz"][1] ** 2)
            + (atomDict["Q_YY"][0] + atomDict["R+2"][0]) / 3
            + atomDict["xyz"][1] * atomDict["Mu_Intra_Y"][0]
            + atomDict["xyz"][1] * atomDict["Mu_Intra_Y"][0]
        }
    )
    secondMomentDict.update(
        {
            "Qyz": atomDict["q"][0] * atomDict["xyz"][1] * atomDict["xyz"][2]
            + atomDict["Q_YZ"][0] / 3
            + atomDict["xyz"][2] * atomDict["Mu_Intra_Y"][0]
            + atomDict["xyz"][1] * atomDict["Mu_Intra_Z"][0]
        }
    )
    secondMomentDict.update(
        {
            "Qzz": atomDict["q"][0] * (atomDict["xyz"][2] ** 2)
            + (atomDict["Q_ZZ"][0] + atomDict["R+2"][0]) / 3
            + atomDict["xyz"][2] * atomDict["Mu_Intra_Z"][0]
            + atomDict["xyz"][2] * atomDict["Mu_Intra_Z"][0]
        }
    )
    secondMomentDict.update(
        {
            "trace": secondMomentDict["Qxx"]
            + secondMomentDict["Qyy"]
            + secondMomentDict["Qzz"]
        }
    )
    atomicQuadrupoleDict = {}
    atomicQuadrupoleDict.update(
        {"Q_xx": [0.5 * (3 * secondMomentDict["Qxx"] - secondMomentDict["trace"])]}
    )
    atomicQuadrupoleDict.update({"Q_xy": [0.5 * (3 * secondMomentDict["Qxy"])]})
    atomicQuadrupoleDict.update({"Q_xz": [0.5 * (3 * secondMomentDict["Qxz"])]})
    atomicQuadrupoleDict.update(
        {"Q_yy": [0.5 * (3 * secondMomentDict["Qyy"] - secondMomentDict["trace"])]}
    )
    atomicQuadrupoleDict.update({"Q_yz": [0.5 * (3 * secondMomentDict["Qyz"])]})
    atomicQuadrupoleDict.update(
        {"Q_zz": [0.5 * (3 * secondMomentDict["Qzz"] - secondMomentDict["trace"])]}
    )
    return atomicQuadrupoleDict  # return dictionary



[docs]
def get_sub_props(atomDict: dict, subAtoms: list[int], atomList: list[str]):
    """Combine atomic properties together into group properties

    Args:
        atomDict: output from get_atomic_props
        subAtoms: list[int] integer labels of atoms in substituent [1, 2,...]
        atomList: list[str] string list of atom labels, ['C1', 'H2'...]

    Returns:
        dictionary of group properties with the following keys: q,K, K_Scaled, Mu_Intra_X, Mu_Intra_Y,
        Mu_Intra_Z, Mu_Bond_X, Mu_Bond_Y,Mu_Bond_Z, Mu_X, Mu_Y, Mu_Z, Q_xx, Q_xy, Q_xz,
        Q_yy,Q_yz,Q_zz,Vol, ``|Mu_Intra|``,``|Mu_Bond|``,``|Mu|``

    Note:
        in output dictionary, each property is stored as a one element list(to enable pandas later)
        Access properties as outDict['key'][0]
    """
    groupDict = {}  # create empty dict
    first = True  # flag for first iteration through loop
    for atom in subAtoms:
        if first:  # if first time, create all elements in dicionary
            first = False  # don't come here again
            if atomList[atom - 1] not in list(atomDict.keys()):
                raise ValueError(f"{atomList[atom-1]} not in atoms in file")
            groupDict = {
                "q": [atomDict[atomList[atom - 1]]["q"][0]],
                "K": [atomDict[atomList[atom - 1]]["K"][0]],
                "K_Scaled": [atomDict[atomList[atom - 1]]["K_Scaled"][0]],
                "Mu_Intra_X": [atomDict[atomList[atom - 1]]["Mu_Intra_X"][0]],
                "Mu_Intra_Y": [atomDict[atomList[atom - 1]]["Mu_Intra_Y"][0]],
                "Mu_Intra_Z": [atomDict[atomList[atom - 1]]["Mu_Intra_Z"][0]],
                "Mu_Bond_X": [atomDict[atomList[atom - 1]]["Mu_Bond_X"][0]],
                "Mu_Bond_Y": [atomDict[atomList[atom - 1]]["Mu_Bond_Y"][0]],
                "Mu_Bond_Z": [atomDict[atomList[atom - 1]]["Mu_Bond_Z"][0]],
                "Mu_X": [atomDict[atomList[atom - 1]]["Mu_X"][0]],
                "Mu_Y": [atomDict[atomList[atom - 1]]["Mu_Y"][0]],
                "Mu_Z": [atomDict[atomList[atom - 1]]["Mu_Z"][0]],
                "Q_xx": [atomDict[atomList[atom - 1]]["quadContrib"]["Q_xx"][0]],
                "Q_xy": [atomDict[atomList[atom - 1]]["quadContrib"]["Q_xy"][0]],
                "Q_xz": [atomDict[atomList[atom - 1]]["quadContrib"]["Q_xz"][0]],
                "Q_yy": [atomDict[atomList[atom - 1]]["quadContrib"]["Q_yy"][0]],
                "Q_yz": [atomDict[atomList[atom - 1]]["quadContrib"]["Q_yz"][0]],
                "Q_zz": [atomDict[atomList[atom - 1]]["quadContrib"]["Q_zz"][0]],
                "Vol": [atomDict[atomList[atom - 1]]["Vol"][0]],
            }
        else:  # for the rest, add the atomic property to the group dictionary element
            for prop in groupDict:  # pylint:disable=consider-using-dict-items
                if "Q" not in prop:
                    groupDict[prop][0] += atomDict[atomList[atom - 1]][prop][0]
                else:
                    groupDict[prop][0] += atomDict[atomList[atom - 1]]["quadContrib"][
                        prop
                    ][0]
    groupDict.update(
        {
            "|Mu_Intra|": [
                math.sqrt(
                    groupDict["Mu_Intra_X"][0] ** 2
                    + groupDict["Mu_Intra_Y"][0] ** 2
                    + groupDict["Mu_Intra_Z"][0] ** 2
                )
            ]
        }
    )
    groupDict.update(
        {
            "|Mu_Bond|": [
                math.sqrt(
                    groupDict["Mu_Bond_X"][0] ** 2
                    + groupDict["Mu_Bond_Y"][0] ** 2
                    + groupDict["Mu_Bond_Z"][0] ** 2
                )
            ]
        }
    )
    groupDict.update(
        {
            "|Mu|": [
                math.sqrt(
                    groupDict["Mu_X"][0] ** 2
                    + groupDict["Mu_Y"][0] ** 2
                    + groupDict["Mu_Z"][0] ** 2
                )
            ]
        }
    )
    return groupDict



def _check_num_atoms(atom_label_list: list[str], atom_int_list: list[str]) -> None:
    """Runs a check to make sure number of atoms equals the expected.

    Returns ValueError if not"""
    num_atoms = len(atom_label_list)
    max_int = max(atom_int_list)
    if max_int > num_atoms:
        raise ValueError(
            f"Largest group atom index {max_int} greater than number of atoms in file"
        )



[docs]
def extract_sub_props(
    data: list[str],
    subAtoms: list[int],
    sumFileNoExt: str,
    groupProps: bool = True,
    bcpId=[[1, 2]],
    lapRhoCpAtoms=[],
):  # pylint:disable=dangerous-default-value
    # pylint:disable=too-many-arguments
    """returns a dictionary of all group properties - bcp, group, atomic, and vscc

    Args:
        data: list[str] - lines of a .sum file
        subAtoms: indices of atoms in the molecule comprising group - starts at 1
        groupProps: do you want to compute group properties
        bcpId: list of 2 length lists. Pass empty list if no bcp properties wanted.
        2 length lists are indices of atoms that you want BCPs for
        lapRhoCpAtoms = list of atom indices that you want to find laprhoCPs for. Defaults to empty

    Returns:
        nested dictionaries with keys 'Group', 'Atomic', 'BCP', 'VSCC'

    Notes:
        vol surface is 0.001 au isodensity

    """
    atomList = list(
        _get_table(
            data,
            "q(A)              L(A)              K(A)          K_Scaled(A)      |Mu_Intra(A)|",
        )["Atom"]
    )
    _check_num_atoms(atomList, subAtoms)
    subatomLabels = [
        atomList[i]
        for i in range(0, len(atomList))
        if any(x == i + 1 for x in subAtoms)
    ]

    atomicProps = get_atomic_props(data)  # get atomic dictionary
    bcpProperties = extract_requested_bcp_props(
        data, atomList, bcpId, subatomLabels, atomicProps
    )
    if groupProps:  # if you want group properties
        subDict = get_sub_props(
            atomicProps, subAtoms, atomList
        )  # get substituent properties
    else:
        subDict = "Group Properties not requested"

    if len(lapRhoCpAtoms) > 0:  # if we want laprhocps for at least one atom
        vsccProps = extract_requested_cc_props(
            lapRhoCpAtoms, sumFileNoExt, atomList, atomicProps
        )
    else:
        vsccProps = {"VSCC Props not requested"}
    # create output dictionary to return all requested properties, if a property not requested return a string stating that
    outDict = {
        "Group": subDict,
        "Atomic": atomicProps,
        "BCP": bcpProperties,
        "VSCC": vsccProps,
    }
    return outDict




[docs]
def extract_requested_cc_props(
    lapRhoCpAtoms: list[int], sumFileNoExt: str, atomList: list[str], atomicProps: dict
) -> dict:
    """Get VSCC dict for requested atoms

    Args:
        lapRhoCpAtoms: indices of atoms(starting at 1) that you want to find VSCC for
        sumFileNoExt: filename of sum file without extension
        atomList: list of atom labels in molecule. e.g. ['C1','H2']
        atomicProps: dictionary output of get_atomic_props

    Returns:
        Nested dictionary with first layer keys:atomLabel, second layer integer label of VSCC,
        third layer properties extracted

    """
    vsccProps = {}
    for atom in lapRhoCpAtoms:  # for each atom requested, get lapRhoCps
        allCC = get_cc_props(sumFileNoExt, atomList[atom - 1])
        vsccProps.update(
            {atomList[atom - 1]: identify_vscc(allCC, atomicProps, atomList[atom - 1])}
        )
    return vsccProps




[docs]
def extract_requested_bcp_props(
    data: list[str],
    atomList: list[str],
    bcpId: list[list[int]],
    subatomLabels: list[str],
    atomicProps: dict,
) -> dict:
    """Get BCP dict for requested bcps

    Args:
        data: lines of .sum file
        atomList: list of atom labels in molecule. e.g. ['C1','H2']
        bcpId: list of 2 length lists.
        subAtomLabels: list of atoms in the substituent, e.g.["C1","C2"]
        atomicProps: dictionary output of get_atomic_props

    Returns:
        Nested dictionary. First level is the ID of bcp requested of form 'A1-A2', second is extracted properties

    """
    bcpIdx = []
    for bcp in bcpId:
        atList = []
        for at in bcp:
            atList.append(at)
        bcpIdx.append(atList)
    bcpProperties = {}  # initialize empty dictionary
    if len(bcpIdx) > 0:
        for bcpPair in bcpIdx:
            bcpPair[0] -= 1  # lists start from 0, indices would be passed from molecule
            bcpPair[1] -= 1  # which would start at 1, so adjust index
            # add the bcp properties to the dictionary under the atom labels
            prop = get_bcp_properties(data, [atomList[i] for i in bcpPair])
            prop.update({"DI(R,G)": [get_sub_di(data, subatomLabels)]})
            for key in atomicProps:
                if key in (atomList[bcpPair[0]], atomList[bcpPair[1]]):
                    if "1" in key:
                        keynum = 1
                    elif "2" in key:
                        keynum = 2
                    prop.update(
                        {
                            f"r(BCP-{keynum})": [
                                _get_dist(atomicProps[key]["xyz"], prop["xyz"])
                            ]
                        }
                    )
            bcpProperties.update(
                {f"{atomList[bcpPair[0]]}-{atomList[bcpPair[1]]}": prop}
            )
    else:  #
        bcpProperties = "BCP Properties not requested"
    return bcpProperties



def get_selected_bcps(data: list[str], bcp_list: list[list[str]]):
    """Given a list of bcps, return dictionary of bcp properties.

    Args:
        data: lines of sum file
        bcp_list: list of 2 length lists.

    Returns:
        Nested dictionary. First level is the ID of bcp requested of form 'A1-A2', second is extracted properties

    Notes: as compared to extract_requested_bcp_props, returns fewer properties - r(a1-a2), no DI(R,G)
    """
    bcpProperties = {}  # initialize empty dictionary
    for bcpPair in bcp_list:
        prop = get_bcp_properties(data, bcpPair)
        bcpProperties.update({f"{bcpPair[0]}-{bcpPair[1]}": prop})
    return bcpProperties


def find_connected(data, negXAtomLabel, originAtomLabel):
    """Given lines of sumfile, atom on -x label, and atom on origin Label, find atoms bonded to origin.

    Args:
        data: list[str]: lines of sumfile
        negXAtomLabel: str: eg. 'H2'
        originAtomLabel: str: eg. 'C1'
    Returns:
        List of List of BCPs connected to origin atom
        e.g.
            [['C1',H3'],['C1','H4'],['C1','H5']]

    """
    # find all atoms to which bonded
    bcpLines = []
    for line in data:
        split_line = line.split()
        if (
            "(3,-1)" in split_line
            and negXAtomLabel not in split_line
            and originAtomLabel in split_line
        ):
            bcpLines.append(line)
    bcpList = []
    for bcp in bcpLines:
        splitbcp = bcp.split()
        bcpList.append([splitbcp[4], splitbcp[5]])
    return bcpList



[docs]
def find_all_connections(data):
    """Given lines of sumfile, finds all BCPs in file.

    Args:
        data: list[str]: lines of sumfile

    Returns:
        List of List of BCPs
        e.g. [['C1',H3'],['C1','H4'],['C1','H5']]

    """
    bcpLines = []
    for line in data:
        split_line = line.split()
        if "(3,-1)" in split_line:
            bcpLines.append(line)
    bcpList = []
    for bcp in bcpLines:
        splitbcp = bcp.split()
        bcpList.append([splitbcp[4], splitbcp[5]])
    return bcpList




[docs]
def sub_prop_frame(csvFile: str) -> dict:  # pylint:disable=too-many-locals
    """Given csv file, extract group properties for all files included and store properties

    Args:
        csvFile: string containing csv file (WITH extension)
        example csvFile structure:
        Substituent, subAtoms,    label1,     label2,...
        CH3          1 3 4 5      SubH_CH3  SubC6H5_CH3
            Substituent: string of substituent
            subAtoms: string of space separated substituent atoms eg '1 3 4'
            label1 - contains .sum file with no extension
            labeli could be e.g. a model chemistry, or substrate
            that is the column "SubH" would have sum files for the substituents attached to H
            SubC6H5 would have sum files for substituents attached to C6H5 etc

    Returns:
        dictionary of dicitonary of data frames containing group properties
        {'label1': {'Group': Pandas Data Frame, 'BCP': Pandas Data Frame}}

    Notes:
        Group frame has columns: Substituent, q,K, K_Scaled, Mu_Intra_X, Mu_Intra_Y, Mu_Intra_Z,
        Mu_Bond_X, Mu_Bond_Y,Mu_Bond_Z, Mu_X, Mu_Y, Mu_Z, Q_xx, Q_xy, Q_xz, Q_yy,Q_yz,Q_zz,Vol,
        ``|Mu_Intra|``,``|Mu_Bond|``,``|Mu|``
        BCP frame has columns: Substituent, rho, delsqrho, lambda1, lambda2, lambda3,V,G,H,DI

    """
    csvFrame = pd.read_csv(csvFile)

    all_label_dict = {}

    ncolumns = csvFrame.shape[1]
    nrow = csvFrame.shape[0]
    subAtoms = []
    for sub in range(0, nrow):  # get subAtoms from csv in list format
        subAtomsString = csvFrame.loc[sub]["subAtoms"].split()
        subAtomsInt = [ast.literal_eval(i) for i in subAtomsString]
        subAtoms.append(subAtomsInt)
    for col in range(
        2, ncolumns
    ):  # 2 is the start of the label columns, so for all labels:  # create empty dictionary for this label
        count = 0  # For first iteration for each label will create the data frame
        for row_num, sub in enumerate(
            csvFrame["Substituent"]
        ):  # iterate over substituents
            sumFileName = csvFrame[csvFrame["Substituent"] == sub][
                csvFrame.columns[col]
            ].iloc[0]
            # get properties
            with open(sumFileName + ".sum", encoding="utf-8") as sumFile:
                data = sumFile.readlines()
            extracted_props = extract_sub_props(data, subAtoms[row_num], sumFileName)
            excludeKeys = ["xyz"]
            extracted_props["Group"].update({"Substituent": sub})
            if count == 0:
                count = 1  # don't come here again for this label (don't need to make data frame again)
                groupFrame = pd.DataFrame.from_dict(
                    extracted_props["Group"], orient="columns"
                )
                # for each bcp properties gotten for - currently only do this for one
                # should work generally soon
                for bnum, bcp in enumerate(
                    extracted_props["BCP"]
                ):  # currently only use for one bcp query
                    tempbcpDict = {
                        k: extracted_props["BCP"][bcp][k]
                        for k in set(list(extracted_props["BCP"][bcp].keys()))
                        - set(excludeKeys)
                    }
                    tempbcpDict.update({"x": [extracted_props["BCP"][bcp]["xyz"][0]]})
                    tempbcpDict.update({"y": [extracted_props["BCP"][bcp]["xyz"][1]]})
                    tempbcpDict.update({"z": [extracted_props["BCP"][bcp]["xyz"][2]]})
                    tempbcpDict.update({"Substituent": sub})
                    tempbcpDict.update({"BCP": bcp})
                    if bnum == 0:  # create data frame for first bcp in list
                        bcpFrame = pd.DataFrame.from_dict(tempbcpDict, orient="columns")
                    else:  # else add to data frame
                        bcpFrame = pd.concat(
                            [bcpFrame, pd.DataFrame(tempbcpDict)], ignore_index=True
                        )
            else:  # add to data frame after first iteration
                groupFrame = pd.concat(
                    [groupFrame, pd.DataFrame(extracted_props["Group"])],
                    ignore_index=True,
                )
                for bcp in extracted_props["BCP"]:
                    tempbcpDict = {
                        k: extracted_props["BCP"][bcp][k]
                        for k in set(list(extracted_props["BCP"][bcp].keys()))
                        - set(excludeKeys)
                    }
                    tempbcpDict.update({"x": [extracted_props["BCP"][bcp]["xyz"][0]]})
                    tempbcpDict.update({"y": [extracted_props["BCP"][bcp]["xyz"][1]]})
                    tempbcpDict.update({"z": [extracted_props["BCP"][bcp]["xyz"][2]]})
                    tempbcpDict.update({"Substituent": sub})
                    tempbcpDict.update({"BCP": bcp})
                    bcpFrame = pd.concat(
                        [bcpFrame, pd.DataFrame(tempbcpDict)], ignore_index=True
                    )
        all_label_dict.update(
            {csvFrame.columns[col]: {"Group": groupFrame, "BCP": bcpFrame}}
        )
    return all_label_dict




[docs]
def get_xyz(data: list[str]) -> dict:
    """Given lines of sumfile, return dicitonary containing xyzcoordinates and atom labels

    Args:
        data: lines of sum file

    Returns:
        Dictionary
            'xyz': dataframe of xyz coordinates of nuclei in sum file
            'Atoms': list of atom labels

    """

    xyzTable = _get_table(
        data,
        "Charge                X                  Y                  Z",
        endString="Some Atomic Properties:",
    )
    xyzTable["X"] = pd.to_numeric(xyzTable["X"], downcast="float")
    xyzTable["Y"] = pd.to_numeric(xyzTable["Y"], downcast="float")
    xyzTable["Z"] = pd.to_numeric(xyzTable["Z"], downcast="float")
    return {
        "xyz": xyzTable[["X", "Y", "Z"]].to_numpy(),
        "Atoms": list(xyzTable["Atom"]),
    }