The `BioModelCacheRetrieval` class queries the BioModels cache using keywords and downloads the matching BioModels.
class BioModelCacheRetrieval:
    """Search the cached BioModels index on GitHub and download matching models.

    Args:
        search_str: Free-text query; matched case-insensitively as a substring
            of each cached model's field values.
        github_owner: Owner of the GitHub repository that holds the cache.
        github_repo_cache: Name of the GitHub repository that holds the cache.
    """

    # Seconds to wait for any single HTTP request before giving up, so a
    # stalled connection cannot hang the caller indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, search_str, github_owner="TheBobBob", github_repo_cache="BiomodelsCache"):
        self.search_str = search_str
        self.github_owner = github_owner
        self.github_repo_cache = github_repo_cache

    def search_models(self):
        """Return the cached models whose metadata contains the search string.

        The JSON index is located through the GitHub repository-contents API
        and then fetched from the ``download_url`` that the API reports.

        Returns:
            dict: Maps each matching model id to a dict with the keys ``ID``,
            ``name`` (lower-cased), ``url``, ``id``, ``title`` and ``authors``.
            Empty when the GitHub request fails, the index is missing or
            empty, or nothing matches.

        Raises:
            KeyError: If a matching entry lacks one of the expected fields.
        """
        BIOMODELS_JSON_DB_PATH = "src/cached_biomodels.json"
        # GitHub REST endpoint: GET /repos/{owner}/{repo}/contents/{path}.
        url = (
            "https://api.github.com/repos/"
            f"{self.github_owner}/{self.github_repo_cache}"
            f"/contents/{BIOMODELS_JSON_DB_PATH}"
        )
        headers = {"Accept": "application/vnd.github+json"}
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"Failed to retrieve data from GitHub. Status code: {response.status_code}")
            return {}

        data = response.json()
        cached_data = None
        if "download_url" in data:
            json_response = requests.get(data["download_url"], timeout=self.REQUEST_TIMEOUT)
            cached_data = json_response.json()
        if not cached_data:
            return {}

        query_text = self.search_str.strip().lower()
        models = {}
        for model_id, model_data in cached_data.items():
            # Entries without a name are not considered searchable models.
            if 'name' not in model_data:
                continue
            # Match against every field at once: all values, lower-cased and
            # joined into a single string.
            model_info = ' '.join(str(value).lower() for value in model_data.values())
            if query_text not in model_info:
                continue
            models[model_id] = {
                'ID': model_id,
                'name': model_data['name'].lower(),
                'url': model_data['url'],
                'id': model_data['model_id'],
                'title': model_data['title'],
                'authors': model_data['authors'],
            }
        return models

    @staticmethod
    def download_model_files(model_url, model_id):
        """Download one model's XML file into a fresh temporary directory.

        Declared as a static method so it can be called either on the class
        or on an instance with the same two arguments.

        Args:
            model_url: Currently ignored; it is overwritten below.
            model_id: Identifier used to build the remote path and the local
                file name (``<model_id>.xml``).

        Returns:
            str | None: Path of the downloaded file, or ``None`` when the
            request fails, returns a non-200 status, or the file cannot be
            written.
        """
        # NOTE(review): the passed-in URL is replaced by a path that has no
        # scheme or host, so the request cannot succeed as written. The base
        # URL of the model store appears to be missing from this snippet --
        # restore it before use.
        model_url = f"{model_id}/{model_id}_url.xml"
        try:
            response = requests.get(model_url, timeout=BioModelCacheRetrieval.REQUEST_TIMEOUT)
            if response.status_code != 200:
                print(f"Failed to download the model from {model_url}. Status code: {response.status_code}")
                return None
            # Create the directory only once there is something to store, so
            # failed downloads do not leave empty temp directories behind.
            LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()
            file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
            with open(file_path, 'wb') as file:
                file.write(response.content)
            print(f"Model {model_id} downloaded successfully: {file_path}")
            return file_path
        except (requests.RequestException, OSError) as e:
            print(f"Error downloading model {model_id} from {model_url}: {e}")
            return None
The `convert_sbml_to_antimony` function converts the downloaded SBML models into Antimony format for the text splitter.
from convert_sbml_to_antimony import convert_sbml_to_antimony
# model_file_path is None when the download failed, so only convert on success.
if model_file_path:
    # Step 3: Convert the downloaded SBML model to Antimony format
    antimony_file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.txt")
    convert_sbml_to_antimony(model_file_path, antimony_file_path)
The `split_biomodels` function splits the downloaded Antimony-format models for use by the text splitter.
from split_biomodels import split_biomodels
# antimony_file_path is the Antimony text file written by convert_sbml_to_antimony.
final_items = split_biomodels(antimony_file_path)
The `create_vector_db` function creates the vector database that the biomodels are passed into. Before they are passed in, each biomodel segment is summarized using an LLM so that it is easier to query by cosine similarity.
from create_vector_db import create_vector_db
# Build the vector database only when all_final_items is non-empty.
if all_final_items:
    db = create_vector_db(all_final_items)
The `generate_response` function queries the vector database and generates a response to a user-provided query.
from generate_response import generate_response
# Prompt for a question, answer it from the vector database `db`, and print the answer.
query_text = input("Enter your question about the model(s): ")
response = generate_response(db, query_text)
print(f"Response: {response}")
Below is an example of how to combine the functions into the full pipeline. The pipeline does the following:
- Gets user input for a search query.
- Searches models based on the query.
- Downloads and processes each model.
- Converts models to Antimony format and splits them.
- Creates a vector database from the split biomodels.
- Queries the database and generates a response based on user input.
It can also be broken down into parts to use each function separately.
from biomodelcache import BioModelCacheRetrieval
from convert_sbml_to_antimony import convert_sbml_to_antimony
from split_biomodels import split_biomodels
from create_vector_db import create_vector_db
from generate_response import generate_response
import os
import tempfile
# Temporary directory for the converted Antimony (.txt) files; created once when the module is loaded.
LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()
def main():
    """Run the whole pipeline: search, download, convert, split, index, query.

    Reads the search query from the module-level ``search_str`` variable,
    which must be set before this function is called.

    Raises:
        ValueError: If no model matches the search query, or if none of the
            matching models could be downloaded and split.
    """
    retriever = BioModelCacheRetrieval(search_str)
    models = retriever.search_models()
    if not models:
        # Raise rather than return the exception object, so the failure
        # cannot be silently ignored by the caller.
        raise ValueError("No models found matching your search query.")

    all_final_items = []
    for model_id, model_data in models.items():
        print(f"Processing model: {model_data['name']}")
        model_url = model_data['url']
        model_file_path = retriever.download_model_files(model_url, model_id)
        if not model_file_path:
            # Download failed; skip this model and keep going with the rest.
            continue
        antimony_file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.txt")
        convert_sbml_to_antimony(model_file_path, antimony_file_path)
        final_items = split_biomodels(antimony_file_path)
        # Collect the pieces of every model; without this step the list stays
        # empty and the database is never built.
        # NOTE(review): assumes split_biomodels returns an iterable of
        # segments -- confirm against its implementation.
        all_final_items.extend(final_items)

    if not all_final_items:
        raise ValueError("No models were processed successfully.")

    db = create_vector_db(all_final_items)
    query_text = input("Enter your question about the model(s): ")
    response = generate_response(db, query_text)
    print(f"Response: {response}")
if __name__ == "__main__":
search_str = input("Enter search query: ")