feat(utils/opatio): added python module for interfacing with opat files

A Python module (opatio) has been written to make the creation and reading of OPAT files straightforward.
This commit is contained in:
2025-02-15 11:50:39 -05:00
parent 6b777b220a
commit 4ec8293088
5 changed files with 340 additions and 0 deletions

View File

@@ -0,0 +1,16 @@
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "opatio"
version = "0.1.0a"
description = "A python module for handling OPAT files"
readme = "readme.md"
authors = [{name = "Emily M. Boudreaux", email = "emily.boudreaux@dartmouth.edu"}]
requires-python = ">=3.8"
dependencies = ["numpy >= 1.21.1"]
[tool.setuptools]
packages = ["opatio", "opatio.opat"]
package-dir = {"" = "src"}

46
utils/opatio/readme.md Normal file
View File

@@ -0,0 +1,46 @@
# opatIO python module
This module defines a set of tools to build, write, and read OPAT files.
The OPAT file format is a custom binary format designed to efficiently store
opacity information for a variety of compositions.
## Installation
You can install this module with pip
```bash
git clone <repo>
cd 4DSSE/utils/opatio
pip install .
```
## General Usage
The general way that this module is meant to be used is to first build a schema for the opacity table and then save that to disk. The module will handle all the byte alignment and lookup table construction for you.
A simple example might look like the following
```python
from opatio import OpatIO
opacityFile = OpatIO()
opacityFile.set_comment("This is a sample opacity file")
opacityFile.set_source("OPLIB")
# some code to get a logR, logT, and logKappa table
# where logKappa is of size (n,m) if logR is size n and
# logT is size m
opacityFile.add_table(X, Z, logR, logT, logKappa)
opacityFile.save("opacity.opat")
```
You can also read opat files which have been generated with the loadOpat function
```python
from opatio import loadOpat
opacityFile = loadOpat("opacity.opat")
print(opacityFile.header)
print(opacityFile.tables[0])
```
## Problems
If you have problems feel free to either submit an issue to the root github repo (tagged as utils/opatio) or email Emily Boudreaux at emily.boudreaux@dartmouth.edu

View File

@@ -0,0 +1 @@
from .opat.opat import OpatIO, loadOpat

View File

View File

@@ -0,0 +1,277 @@
import hashlib
import os
import struct
from collections.abc import Iterable as collectionIterable
from dataclasses import dataclass, replace
from datetime import datetime
from typing import Iterable, List, Tuple

import numpy as np
@dataclass
class Header:
    """Fixed 256-byte OPAT file header.

    Field widths follow the struct format "<4s H I I Q 16s 64s 128s 26s"
    used when packing and unpacking the header.
    """
    magic: str          # 4-byte magic string, "OPAT"
    version: int        # uint16 format version
    numTables: int      # uint32 number of tables stored in the file
    headerSize: int     # uint32 size of this header in bytes (256)
    indexOffset: int    # uint64 byte offset where the table index begins
    creationDate: str   # 16-byte creation date string, e.g. "Feb 15, 2025"
    sourceInfo: str     # 64-byte description of where the data came from
    comment: str        # 128-byte free-form comment
    reserved: bytes     # 26 padding bytes rounding the header to 256
@dataclass
class TableIndex:
    """One 64-byte index entry locating a table in the file.

    Serialized as "<ddQQ" (32 bytes) followed by the raw 32-byte
    SHA-256 digest of the table's packed bytes.
    """
    X: float          # composition key, first coordinate
    Z: float          # composition key, second coordinate
    byteStart: int    # file offset of the table's first byte
    byteEnd: int      # file offset one past the table's last byte
    sha256: bytes     # raw 32-byte SHA-256 digest of the table bytes
@dataclass
class OPATTable:
    """One opacity table: axis sizes, the two axes, and the opacity grid.

    Serialized as "<II" (the two sizes) followed by
    N_R + N_T + N_R*N_T little-endian doubles.
    """
    N_R: int                             # number of logR grid points
    N_T: int                             # number of logT grid points
    logR: Iterable[float]                # logR axis values, length N_R
    logT: Iterable[float]                # logT axis values, length N_T
    logKappa: Iterable[Iterable[float]]  # opacity grid of shape (N_R, N_T)
# Module-level template header.
# NOTE(review): this is a single shared mutable object, and its
# creationDate is evaluated once at import time — copy it (rather than
# aliasing it) before mutating any per-instance fields.
defaultHeader = Header(
    magic="OPAT",
    version=1,
    numTables=0,
    headerSize=256,
    indexOffset=0,  # placeholder; patched at save time once table sizes are known
    creationDate=datetime.now().strftime("%b %d, %Y"),  # e.g. "Feb 15, 2025" (fits the 16-byte field)
    sourceInfo="no source provided by user",
    comment="default header",
    reserved=b"\x00" * 26  # pads the packed header to exactly 256 bytes
)
class OpatIO:
    """In-memory builder for OPAT opacity files.

    Accumulates opacity tables keyed by composition (X, Z) and writes
    them in the binary OPAT layout: a 256-byte header, the packed table
    payloads back to back, then a trailing index of 64-byte entries
    (one per table, each carrying a SHA-256 checksum of its table).
    """

    def __init__(self):
        # Bug fix: the original aliased the shared module-level
        # defaultHeader object, so set_source/set_comment on one
        # instance mutated every other instance (and the default
        # itself).  Copy it instead, stamping the actual creation time
        # rather than the module-import time.
        self.header: Header = replace(
            defaultHeader,
            creationDate=datetime.now().strftime("%b %d, %Y"),
        )
        # Each entry is ((X, Z), table).
        self.tables: List[Tuple[Tuple[float, float], OPATTable]] = []

    @staticmethod
    def validate_char_array_size(s: str, nmax: int) -> bool:
        """Return True when ``s`` fits a fixed-width field of ``nmax`` chars."""
        return len(s) <= nmax

    @staticmethod
    def validate_logKappa(logKappa):
        """Validate that ``logKappa`` is a 2D numeric array or iterable.

        Raises:
            ValueError: if it is not 2D, is empty, or is not numeric.
            TypeError: if it is not an array/iterable of iterables at all.
        """
        if isinstance(logKappa, np.ndarray):
            if logKappa.ndim == 2:
                return
            raise ValueError("logKappa must be a non-empty 2D array")
        # Bug fix: the original misspelled this call as ``isintance``,
        # which raised NameError for every non-ndarray input.
        if isinstance(logKappa, collectionIterable) and all(
            isinstance(row, collectionIterable) for row in logKappa
        ):
            try:
                first_row = next(iter(logKappa))
            except StopIteration:
                raise ValueError("logKappa must be a non-empty 2D iterable")
            # Only the first row is spot-checked for numeric entries.
            if all(isinstance(x, (int, float)) for x in first_row):
                return
            raise ValueError("logKappa must be fully numeric")
        raise TypeError("logKappa must be a non-empty 2D array or iterable")

    @staticmethod
    def validate_1D(arr, name: str):
        """Validate that ``arr`` is a 1D numeric array or iterable.

        Raises:
            ValueError: if a numpy array is not 1D, or entries are not numeric.
            TypeError: if ``arr`` is not an iterable (strings/bytes excluded).
        """
        if isinstance(arr, np.ndarray):
            if arr.ndim == 1:
                return
            raise ValueError(f"{name} must be a 1D numpy array")
        if isinstance(arr, collectionIterable) and not isinstance(arr, (str, bytes)):
            if all(isinstance(x, (int, float)) for x in arr):
                return
            raise ValueError(f"{name} must be fully numeric")
        # Bug fix: the message previously said "2D" for this 1D check.
        raise TypeError(f"{name} must be a non-empty 1D array or iterable")

    @staticmethod
    def compute_checksum(data: bytes) -> bytes:
        """Return the raw 32-byte SHA-256 digest of ``data``."""
        return hashlib.sha256(data).digest()

    def set_version(self, version: int) -> int:
        """Set the header format version and return it."""
        self.header.version = version
        return self.header.version

    def set_source(self, source: str) -> str:
        """Set the header source string (max 64 chars) and return it."""
        if not self.validate_char_array_size(source, 64):
            raise TypeError(f"sourceInfo string ({source}) is too long ({len(source)}). Max length is 64")
        self.header.sourceInfo = source
        return self.header.sourceInfo

    def set_comment(self, comment: str) -> str:
        """Set the header comment string (max 128 chars) and return it."""
        if not self.validate_char_array_size(comment, 128):
            raise TypeError(f"comment string ({comment}) is too long ({len(comment)}). Max length is 128")
        self.header.comment = comment
        return self.header.comment

    def add_table(self, X: float, Z: float, logR: Iterable[float], logT: Iterable[float], logKappa: Iterable[Iterable[float]]):
        """Append one opacity table keyed by composition (X, Z).

        Args:
            X, Z: composition values used as the table's lookup key.
            logR: 1D axis of length n.
            logT: 1D axis of length m.
            logKappa: 2D opacity grid of shape (n, m).

        Raises:
            ValueError: if validation fails or the grid shape does not
                match the axis lengths.
            TypeError: if an argument is not an array/iterable.
        """
        self.validate_logKappa(logKappa)
        self.validate_1D(logR, "logR")
        self.validate_1D(logT, "logT")
        # np.array (not asarray) so the stored table owns its own copy.
        logR = np.array(logR)
        logT = np.array(logT)
        logKappa = np.array(logKappa)
        if logKappa.shape != (logR.shape[0], logT.shape[0]):
            raise ValueError(f"logKappa must be of shape ({len(logR)} x {len(logT)})! Currently logKappa has shape {logKappa.shape}")
        table = OPATTable(
            N_R=logR.shape[0],
            N_T=logT.shape[0],
            logR=logR,
            logT=logT,
            logKappa=logKappa
        )
        self.tables.append(((X, Z), table))
        self.header.numTables += 1

    def _header_bytes(self) -> bytes:
        """Pack the header into its fixed 256-byte binary form."""
        return struct.pack(
            "<4s H I I Q 16s 64s 128s 26s",
            self.header.magic.encode('utf-8'),
            self.header.version,
            self.header.numTables,
            self.header.headerSize,
            self.header.indexOffset,
            self.header.creationDate.encode('utf-8'),
            self.header.sourceInfo.encode('utf-8'),
            self.header.comment.encode('utf-8'),
            self.header.reserved
        )

    def _table_bytes(self, table: OPATTable) -> Tuple[bytes, bytes]:
        """Pack one table; return (sha256 digest, packed bytes).

        Layout: two uint32 sizes, then N_R + N_T + N_R*N_T
        little-endian doubles (logR, logT, row-major logKappa).
        """
        logR = table.logR.flatten()
        logT = table.logT.flatten()
        logKappa = table.logKappa.flatten()
        tableBytes = struct.pack(
            f"<II{table.N_R}d{table.N_T}d{table.N_R*table.N_T}d",
            table.N_R,
            table.N_T,
            *logR,
            *logT,
            *logKappa
        )
        return (self.compute_checksum(tableBytes), tableBytes)

    def _tableIndex_bytes(self, tableIndex: TableIndex) -> bytes:
        """Pack one 64-byte index entry: "<ddQQ" plus the 32-byte digest."""
        tableIndexBytes = struct.pack(
            '<ddQQ',
            tableIndex.X,
            tableIndex.Z,
            tableIndex.byteStart,
            tableIndex.byteEnd
        ) + tableIndex.sha256
        if len(tableIndexBytes) != 64:
            raise RuntimeError(f"Each table index entry must have 64 bytes. Due to an unknown error the table index entry for (X,Z)=({tableIndex.X},{tableIndex.Z}) header has {len(tableIndexBytes)} bytes")
        return tableIndexBytes

    def save(self, filename: str) -> str:
        """Write the OPAT file to disk and return ``filename``.

        File layout: 256-byte header, all table payloads back to back,
        then the index; header.indexOffset points at the index.

        Raises:
            RuntimeError: if the packed header is not exactly 256 bytes.
            IOError: if the file does not exist after the write.
        """
        tempHeaderBytes = self._header_bytes()
        if len(tempHeaderBytes) != 256:
            raise RuntimeError(f"Header must have 256 bytes. Due to an unknown error the header has {len(tempHeaderBytes)} bytes")
        currentStartByte: int = 256  # first table starts right after the header
        tableIndicesBytes: List[bytes] = []
        tablesBytes: List[bytes] = []
        for (X, Z), table in self.tables:
            checksum, tableBytes = self._table_bytes(table)
            tableIndex = TableIndex(
                X=X,
                Z=Z,
                byteStart=currentStartByte,
                byteEnd=currentStartByte + len(tableBytes),
                sha256=checksum
            )
            tablesBytes.append(tableBytes)
            tableIndicesBytes.append(self._tableIndex_bytes(tableIndex))
            currentStartByte += len(tableBytes)
        # The index begins immediately after the last table; re-pack the
        # header now that the offset is known.
        self.header.indexOffset = currentStartByte
        headerBytes = self._header_bytes()
        with open(filename, 'wb') as f:
            f.write(headerBytes)
            for tableBytes in tablesBytes:
                f.write(tableBytes)
            for tableIndexBytes in tableIndicesBytes:
                f.write(tableIndexBytes)
        # Bug fix: the original returned None silently when this check
        # failed; fail loudly instead.
        if not os.path.exists(filename):
            raise IOError(f"failed to write {filename}")
        return filename
def loadOpat(filename: str) -> OpatIO:
    """Read an OPAT file from disk into an OpatIO object.

    Args:
        filename: path to the .opat file.

    Returns:
        OpatIO whose header and tables mirror the file's contents.

    Raises:
        ValueError: if the magic number is wrong, or a table's bytes do
            not match the SHA-256 digest stored in its index entry.
    """
    opat = OpatIO()
    with open(filename, 'rb') as f:
        headerBytes: bytes = f.read(256)
        unpackedHeader = struct.unpack("<4s H I I Q 16s 64s 128s 26s", headerBytes)
        # Bug fix: fixed-width string fields are null padded on write;
        # strip the padding so round-tripped strings compare equal to
        # what the writer was originally given.
        loadedHeader = Header(
            magic=unpackedHeader[0].decode('utf-8'),
            version=unpackedHeader[1],
            numTables=unpackedHeader[2],
            headerSize=unpackedHeader[3],
            indexOffset=unpackedHeader[4],
            creationDate=unpackedHeader[5].decode('utf-8').rstrip('\x00'),
            sourceInfo=unpackedHeader[6].decode('utf-8').rstrip('\x00'),
            comment=unpackedHeader[7].decode('utf-8').rstrip('\x00'),
            reserved=unpackedHeader[8]
        )
        if loadedHeader.magic != "OPAT":
            raise ValueError(f"{filename} is not an OPAT file (bad magic {loadedHeader.magic!r})")
        opat.header = loadedHeader
        # Bug fix: add_table() increments numTables for every table it
        # re-adds below, so the original ended up with double the true
        # count.  Reset the tally and let add_table rebuild it.
        opat.header.numTables = 0
        # The index lives at indexOffset and runs to EOF in 64-byte
        # entries: 32 bytes of "<ddQQ" followed by the 32-byte digest.
        f.seek(loadedHeader.indexOffset)
        tableIndices: List[TableIndex] = []
        while tableIndexEntryBytes := f.read(32):
            unpackedTableIndexEntry = struct.unpack("<ddQQ", tableIndexEntryBytes)
            checksum = f.read(32)
            tableIndices.append(TableIndex(
                X=unpackedTableIndexEntry[0],
                Z=unpackedTableIndexEntry[1],
                byteStart=unpackedTableIndexEntry[2],
                byteEnd=unpackedTableIndexEntry[3],
                sha256=checksum
            ))
        for tableIndex in tableIndices:
            f.seek(tableIndex.byteStart)
            tableBytes = f.read(tableIndex.byteEnd - tableIndex.byteStart)
            # Improvement: the index stores a SHA-256 of each table's
            # payload, but the original reader never verified it.
            if hashlib.sha256(tableBytes).digest() != tableIndex.sha256:
                raise ValueError(f"checksum mismatch for table (X,Z)=({tableIndex.X},{tableIndex.Z}) in {filename}")
            nr_nt_fmt = "<II"
            nr_nt_size = struct.calcsize(nr_nt_fmt)
            N_R, N_T = struct.unpack(nr_nt_fmt, tableBytes[:nr_nt_size])
            dataFormat = f"<{N_R}d{N_T}d{N_R*N_T}d"
            unpackedData = struct.unpack(dataFormat, tableBytes[nr_nt_size:])
            logR = np.array(unpackedData[:N_R], dtype=np.float64)
            logT = np.array(unpackedData[N_R: N_R + N_T], dtype=np.float64)
            # logKappa was written row-major with shape (N_R, N_T).
            logKappa = np.array(unpackedData[N_R + N_T:], dtype=np.float64).reshape((N_R, N_T))
            opat.add_table(tableIndex.X, tableIndex.Z, logR, logT, logKappa)
    return opat