feat(utils/opatio): added python module for interfacing with opat files

A Python module (opatio) has been written to make the creation and reading of OPAT files straightforward.
This commit is contained in:
2025-02-15 11:50:39 -05:00
parent 6b777b220a
commit 4ec8293088
5 changed files with 340 additions and 0 deletions

View File

@@ -0,0 +1,16 @@
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "opatio"
version = "0.1.0a"
description = "A python module for handling OPAT files"
readme = "readme.md"
authors = [{name = "Emily M. Boudreaux", email = "emily.boudreaux@dartmouth.edu"}]
requires-python = ">=3.8"
dependencies = ["numpy >= 1.21.1"]
[tool.setuptools]
packages = ["opatio", "opatio.opat"]
package-dir = {"" = "src"}

46
utils/opatio/readme.md Normal file
View File

@@ -0,0 +1,46 @@
# opatIO python module
This module defines a set of tools to build, write, and read OPAT files.
The OPAT file format is a custom binary format designed to efficiently store
opacity information for a variety of compositions.
## Installation
You can install this module with pip
```bash
git clone <repo>
cd 4DSSE/utils/opatio
pip install .
```
## General Usage
The general way that this module is meant to be used is to first build a schema for the opacity table and then save that to disk. The module will handle all the byte alignment and lookup table construction for you.
A simple example might look like the following
```python
from opatio import OpatIO
opacityFile = OpatIO()
opacityFile.set_comment("This is a sample opacity file")
opacityFile.set_source("OPLIB")
# some code to get a logR, logT, and logKappa table
# where logKappa is of size (n,m) if logR is size n and
# logT is size m
opacityFile.add_table(X, Z, logR, logT, logKappa)
opacityFile.save("opacity.opat")
```
You can also read opat files which have been generated with the loadOpat function
```python
from opatio import loadOpat
opacityFile = loadOpat("opacity.opat")
print(opacityFile.header)
print(opacityFile.tables[0])
```
## Problems
If you have problems feel free to either submit an issue to the root github repo (tagged as utils/opatio) or email Emily Boudreaux at emily.boudreaux@dartmouth.edu

View File

@@ -0,0 +1 @@
from .opat.opat import OpatIO, loadOpat

View File

View File

@@ -0,0 +1,277 @@
import hashlib
import os
import struct
from collections.abc import Iterable as collectionIterable
from dataclasses import dataclass, replace
from datetime import datetime
from typing import Iterable, List, Tuple

import numpy as np
@dataclass
class Header:
    """Fixed 256-byte OPAT file header.

    Field widths follow the struct format "<4s H I I Q 16s 64s 128s 26s"
    used when packing and unpacking the header.
    """
    magic: str          # 4-byte magic string, "OPAT"
    version: int        # uint16 format version
    numTables: int      # uint32 number of tables stored in the file
    headerSize: int     # uint32 size of this header in bytes (256)
    indexOffset: int    # uint64 byte offset where the table index begins
    creationDate: str   # 16-byte creation date string, e.g. "Feb 15, 2025"
    sourceInfo: str     # 64-byte description of where the data came from
    comment: str        # 128-byte free-form comment
    reserved: bytes     # 26 padding bytes rounding the header to 256
@dataclass
class TableIndex:
    """One 64-byte index entry locating a table in the file.

    Serialized as "<ddQQ" (32 bytes) followed by the raw 32-byte
    SHA-256 digest of the table's packed bytes.
    """
    X: float          # composition key, first coordinate
    Z: float          # composition key, second coordinate
    byteStart: int    # file offset of the table's first byte
    byteEnd: int      # file offset one past the table's last byte
    sha256: bytes     # raw 32-byte SHA-256 digest of the table bytes
@dataclass
class OPATTable:
    """One opacity table: axis sizes, the two axes, and the opacity grid.

    Serialized as "<II" (the two sizes) followed by
    N_R + N_T + N_R*N_T little-endian doubles.
    """
    N_R: int                             # number of logR grid points
    N_T: int                             # number of logT grid points
    logR: Iterable[float]                # logR axis values, length N_R
    logT: Iterable[float]                # logT axis values, length N_T
    logKappa: Iterable[Iterable[float]]  # opacity grid of shape (N_R, N_T)
# Module-level template header.
# NOTE(review): this is a single shared mutable object, and its
# creationDate is evaluated once at import time — copy it (rather than
# aliasing it) before mutating any per-instance fields.
defaultHeader = Header(
    magic="OPAT",
    version=1,
    numTables=0,
    headerSize=256,
    indexOffset=0,  # placeholder; patched at save time once table sizes are known
    creationDate=datetime.now().strftime("%b %d, %Y"),  # e.g. "Feb 15, 2025" (fits the 16-byte field)
    sourceInfo="no source provided by user",
    comment="default header",
    reserved=b"\x00" * 26  # pads the packed header to exactly 256 bytes
)
class OpatIO:
    """In-memory builder for OPAT opacity files.

    Accumulates opacity tables keyed by composition (X, Z) and writes
    them in the binary OPAT layout: a 256-byte header, the packed table
    payloads back to back, then a trailing index of 64-byte entries
    (one per table, each carrying a SHA-256 checksum of its table).
    """

    def __init__(self):
        # Bug fix: the original aliased the shared module-level
        # defaultHeader object, so set_source/set_comment on one
        # instance mutated every other instance (and the default
        # itself).  Copy it instead, stamping the actual creation time
        # rather than the module-import time.
        self.header: Header = replace(
            defaultHeader,
            creationDate=datetime.now().strftime("%b %d, %Y"),
        )
        # Each entry is ((X, Z), table).
        self.tables: List[Tuple[Tuple[float, float], OPATTable]] = []

    @staticmethod
    def validate_char_array_size(s: str, nmax: int) -> bool:
        """Return True when ``s`` fits a fixed-width field of ``nmax`` chars."""
        return len(s) <= nmax

    @staticmethod
    def validate_logKappa(logKappa):
        """Validate that ``logKappa`` is a 2D numeric array or iterable.

        Raises:
            ValueError: if it is not 2D, is empty, or is not numeric.
            TypeError: if it is not an array/iterable of iterables at all.
        """
        if isinstance(logKappa, np.ndarray):
            if logKappa.ndim == 2:
                return
            raise ValueError("logKappa must be a non-empty 2D array")
        # Bug fix: the original misspelled this call as ``isintance``,
        # which raised NameError for every non-ndarray input.
        if isinstance(logKappa, collectionIterable) and all(
            isinstance(row, collectionIterable) for row in logKappa
        ):
            try:
                first_row = next(iter(logKappa))
            except StopIteration:
                raise ValueError("logKappa must be a non-empty 2D iterable")
            # Only the first row is spot-checked for numeric entries.
            if all(isinstance(x, (int, float)) for x in first_row):
                return
            raise ValueError("logKappa must be fully numeric")
        raise TypeError("logKappa must be a non-empty 2D array or iterable")

    @staticmethod
    def validate_1D(arr, name: str):
        """Validate that ``arr`` is a 1D numeric array or iterable.

        Raises:
            ValueError: if a numpy array is not 1D, or entries are not numeric.
            TypeError: if ``arr`` is not an iterable (strings/bytes excluded).
        """
        if isinstance(arr, np.ndarray):
            if arr.ndim == 1:
                return
            raise ValueError(f"{name} must be a 1D numpy array")
        if isinstance(arr, collectionIterable) and not isinstance(arr, (str, bytes)):
            if all(isinstance(x, (int, float)) for x in arr):
                return
            raise ValueError(f"{name} must be fully numeric")
        # Bug fix: the message previously said "2D" for this 1D check.
        raise TypeError(f"{name} must be a non-empty 1D array or iterable")

    @staticmethod
    def compute_checksum(data: bytes) -> bytes:
        """Return the raw 32-byte SHA-256 digest of ``data``."""
        return hashlib.sha256(data).digest()

    def set_version(self, version: int) -> int:
        """Set the header format version and return it."""
        self.header.version = version
        return self.header.version

    def set_source(self, source: str) -> str:
        """Set the header source string (max 64 chars) and return it."""
        if not self.validate_char_array_size(source, 64):
            raise TypeError(f"sourceInfo string ({source}) is too long ({len(source)}). Max length is 64")
        self.header.sourceInfo = source
        return self.header.sourceInfo

    def set_comment(self, comment: str) -> str:
        """Set the header comment string (max 128 chars) and return it."""
        if not self.validate_char_array_size(comment, 128):
            raise TypeError(f"comment string ({comment}) is too long ({len(comment)}). Max length is 128")
        self.header.comment = comment
        return self.header.comment

    def add_table(self, X: float, Z: float, logR: Iterable[float], logT: Iterable[float], logKappa: Iterable[Iterable[float]]):
        """Append one opacity table keyed by composition (X, Z).

        Args:
            X, Z: composition values used as the table's lookup key.
            logR: 1D axis of length n.
            logT: 1D axis of length m.
            logKappa: 2D opacity grid of shape (n, m).

        Raises:
            ValueError: if validation fails or the grid shape does not
                match the axis lengths.
            TypeError: if an argument is not an array/iterable.
        """
        self.validate_logKappa(logKappa)
        self.validate_1D(logR, "logR")
        self.validate_1D(logT, "logT")
        # np.array (not asarray) so the stored table owns its own copy.
        logR = np.array(logR)
        logT = np.array(logT)
        logKappa = np.array(logKappa)
        if logKappa.shape != (logR.shape[0], logT.shape[0]):
            raise ValueError(f"logKappa must be of shape ({len(logR)} x {len(logT)})! Currently logKappa has shape {logKappa.shape}")
        table = OPATTable(
            N_R=logR.shape[0],
            N_T=logT.shape[0],
            logR=logR,
            logT=logT,
            logKappa=logKappa
        )
        self.tables.append(((X, Z), table))
        self.header.numTables += 1

    def _header_bytes(self) -> bytes:
        """Pack the header into its fixed 256-byte binary form."""
        return struct.pack(
            "<4s H I I Q 16s 64s 128s 26s",
            self.header.magic.encode('utf-8'),
            self.header.version,
            self.header.numTables,
            self.header.headerSize,
            self.header.indexOffset,
            self.header.creationDate.encode('utf-8'),
            self.header.sourceInfo.encode('utf-8'),
            self.header.comment.encode('utf-8'),
            self.header.reserved
        )

    def _table_bytes(self, table: OPATTable) -> Tuple[bytes, bytes]:
        """Pack one table; return (sha256 digest, packed bytes).

        Layout: two uint32 sizes, then N_R + N_T + N_R*N_T
        little-endian doubles (logR, logT, row-major logKappa).
        """
        logR = table.logR.flatten()
        logT = table.logT.flatten()
        logKappa = table.logKappa.flatten()
        tableBytes = struct.pack(
            f"<II{table.N_R}d{table.N_T}d{table.N_R*table.N_T}d",
            table.N_R,
            table.N_T,
            *logR,
            *logT,
            *logKappa
        )
        return (self.compute_checksum(tableBytes), tableBytes)

    def _tableIndex_bytes(self, tableIndex: TableIndex) -> bytes:
        """Pack one 64-byte index entry: "<ddQQ" plus the 32-byte digest."""
        tableIndexBytes = struct.pack(
            '<ddQQ',
            tableIndex.X,
            tableIndex.Z,
            tableIndex.byteStart,
            tableIndex.byteEnd
        ) + tableIndex.sha256
        if len(tableIndexBytes) != 64:
            raise RuntimeError(f"Each table index entry must have 64 bytes. Due to an unknown error the table index entry for (X,Z)=({tableIndex.X},{tableIndex.Z}) header has {len(tableIndexBytes)} bytes")
        return tableIndexBytes

    def save(self, filename: str) -> str:
        """Write the OPAT file to disk and return ``filename``.

        File layout: 256-byte header, all table payloads back to back,
        then the index; header.indexOffset points at the index.

        Raises:
            RuntimeError: if the packed header is not exactly 256 bytes.
            IOError: if the file does not exist after the write.
        """
        tempHeaderBytes = self._header_bytes()
        if len(tempHeaderBytes) != 256:
            raise RuntimeError(f"Header must have 256 bytes. Due to an unknown error the header has {len(tempHeaderBytes)} bytes")
        currentStartByte: int = 256  # first table starts right after the header
        tableIndicesBytes: List[bytes] = []
        tablesBytes: List[bytes] = []
        for (X, Z), table in self.tables:
            checksum, tableBytes = self._table_bytes(table)
            tableIndex = TableIndex(
                X=X,
                Z=Z,
                byteStart=currentStartByte,
                byteEnd=currentStartByte + len(tableBytes),
                sha256=checksum
            )
            tablesBytes.append(tableBytes)
            tableIndicesBytes.append(self._tableIndex_bytes(tableIndex))
            currentStartByte += len(tableBytes)
        # The index begins immediately after the last table; re-pack the
        # header now that the offset is known.
        self.header.indexOffset = currentStartByte
        headerBytes = self._header_bytes()
        with open(filename, 'wb') as f:
            f.write(headerBytes)
            for tableBytes in tablesBytes:
                f.write(tableBytes)
            for tableIndexBytes in tableIndicesBytes:
                f.write(tableIndexBytes)
        # Bug fix: the original returned None silently when this check
        # failed; fail loudly instead.
        if not os.path.exists(filename):
            raise IOError(f"failed to write {filename}")
        return filename
def loadOpat(filename: str) -> OpatIO:
    """Read an OPAT file from disk into an OpatIO object.

    Args:
        filename: path to the .opat file.

    Returns:
        OpatIO whose header and tables mirror the file's contents.

    Raises:
        ValueError: if the magic number is wrong, or a table's bytes do
            not match the SHA-256 digest stored in its index entry.
    """
    opat = OpatIO()
    with open(filename, 'rb') as f:
        headerBytes: bytes = f.read(256)
        unpackedHeader = struct.unpack("<4s H I I Q 16s 64s 128s 26s", headerBytes)
        # Bug fix: fixed-width string fields are null padded on write;
        # strip the padding so round-tripped strings compare equal to
        # what the writer was originally given.
        loadedHeader = Header(
            magic=unpackedHeader[0].decode('utf-8'),
            version=unpackedHeader[1],
            numTables=unpackedHeader[2],
            headerSize=unpackedHeader[3],
            indexOffset=unpackedHeader[4],
            creationDate=unpackedHeader[5].decode('utf-8').rstrip('\x00'),
            sourceInfo=unpackedHeader[6].decode('utf-8').rstrip('\x00'),
            comment=unpackedHeader[7].decode('utf-8').rstrip('\x00'),
            reserved=unpackedHeader[8]
        )
        if loadedHeader.magic != "OPAT":
            raise ValueError(f"{filename} is not an OPAT file (bad magic {loadedHeader.magic!r})")
        opat.header = loadedHeader
        # Bug fix: add_table() increments numTables for every table it
        # re-adds below, so the original ended up with double the true
        # count.  Reset the tally and let add_table rebuild it.
        opat.header.numTables = 0
        # The index lives at indexOffset and runs to EOF in 64-byte
        # entries: 32 bytes of "<ddQQ" followed by the 32-byte digest.
        f.seek(loadedHeader.indexOffset)
        tableIndices: List[TableIndex] = []
        while tableIndexEntryBytes := f.read(32):
            unpackedTableIndexEntry = struct.unpack("<ddQQ", tableIndexEntryBytes)
            checksum = f.read(32)
            tableIndices.append(TableIndex(
                X=unpackedTableIndexEntry[0],
                Z=unpackedTableIndexEntry[1],
                byteStart=unpackedTableIndexEntry[2],
                byteEnd=unpackedTableIndexEntry[3],
                sha256=checksum
            ))
        for tableIndex in tableIndices:
            f.seek(tableIndex.byteStart)
            tableBytes = f.read(tableIndex.byteEnd - tableIndex.byteStart)
            # Improvement: the index stores a SHA-256 of each table's
            # payload, but the original reader never verified it.
            if hashlib.sha256(tableBytes).digest() != tableIndex.sha256:
                raise ValueError(f"checksum mismatch for table (X,Z)=({tableIndex.X},{tableIndex.Z}) in {filename}")
            nr_nt_fmt = "<II"
            nr_nt_size = struct.calcsize(nr_nt_fmt)
            N_R, N_T = struct.unpack(nr_nt_fmt, tableBytes[:nr_nt_size])
            dataFormat = f"<{N_R}d{N_T}d{N_R*N_T}d"
            unpackedData = struct.unpack(dataFormat, tableBytes[nr_nt_size:])
            logR = np.array(unpackedData[:N_R], dtype=np.float64)
            logT = np.array(unpackedData[N_R: N_R + N_T], dtype=np.float64)
            # logKappa was written row-major with shape (N_R, N_T).
            logKappa = np.array(unpackedData[N_R + N_T:], dtype=np.float64).reshape((N_R, N_T))
            opat.add_table(tableIndex.X, tableIndex.Z, logR, logT, logKappa)
    return opat