Hi, I ran into the same issue and ended up writing my own function to retrieve the pathway hierarchies in MetaCyc (MetaCyc Pathways). Hope it might help somebody!
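The resulting pandas DataFrame has one row per leaf pathway (index `feature`, formatted as `ID: label`) and columns `level_1`, `level_2`, ... holding that pathway's hierarchy from the first level below the `Pathways` root down to the leaf itself, padded with `None` where a branch is shallower than the deepest one. It looks like this: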
The code:
import pandas as pd
import requests
import json
def dfs(current_node_id, branch_visited, timeout=30):
    """
    Depth-First Search (DFS) function to retrieve pathway hierarchies from MetaCyc.

    Parameters:
    current_node_id (str): The ID of the current node (pathway) being visited.
    branch_visited (list): List of "ID: label" strings visited so far in the current branch.
    timeout (float): Seconds to wait for each HTTP request before giving up.

    Returns:
    None (The results are stored in the global variable 'recorded_pathways').

    Raises:
    requests.exceptions.RequestException: If a request times out, the connection
        fails, or the server answers with an HTTP error status.
    """
    # 'recorded_pathways' is a module-level list that is only appended to here,
    # so no 'global' declaration is required.

    # Ask the MetaCyc website for the direct children of the current node.
    # Passing the ID through 'params' lets requests URL-encode it, so IDs that
    # contain reserved characters still form a valid query string.
    response = requests.get(
        "https://biocyc.org/META/ajax-direct-subs",
        params={"object": current_node_id},
        timeout=timeout,  # without a timeout a stalled connection would hang the whole crawl
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()

    # The response body is expected to be a JSON array of child-pathway objects.
    for pathway in json.loads(response.text):
        next_node_id = pathway["id"]  # ID of the child pathway to explore.
        next_node_label = pathway["label"]  # Label (name) of the child pathway.

        # Build a new list rather than mutating 'branch_visited', so sibling
        # branches do not see each other's entries.
        branch_updated = branch_visited + [f"{next_node_id}: {next_node_label}"]

        if pathway["numInstances"] == 0:
            # Treated as a leaf pathway (lowest hierarchy level): record the full branch.
            recorded_pathways.append(branch_updated)
        else:
            # Recurse into the child pathway to explore its own children.
            dfs(
                current_node_id=next_node_id,
                branch_visited=branch_updated,
                timeout=timeout,
            )
# Retrieve the hierarchy by traversing all pathway pages on the BioCyc website using DFS.
# 'dfs' appends one list per leaf pathway to 'recorded_pathways'; each list starts with
# the root entry "Pathways: Pathways" and ends with the leaf itself.
recorded_pathways = []
dfs(current_node_id="Pathways", branch_visited=["Pathways: Pathways"])

# Depth of the deepest branch, not counting the root entry.
# 'default=0' keeps the script from crashing when nothing was recorded.
max_pathway_hierarchy = max((len(branch) - 1 for branch in recorded_pathways), default=0)

# Pad every branch with None up to the maximum depth so all rows have the same length.
# Each entry maps the leaf pathway to its padded hierarchy (root entry dropped).
padded_recorded_pathways = []
for pathway in recorded_pathways:
    actual_pathway = pathway[1:]
    padding = [None] * (max_pathway_hierarchy - len(actual_pathway))
    padded_recorded_pathways.append({pathway[-1]: actual_pathway + padding})

# Column names 'level_1', 'level_2', etc., one per hierarchy level.
annotated_columns = [f"level_{i + 1}" for i in range(max_pathway_hierarchy)]

# Build the DataFrame in a single call. Concatenating one single-row frame per pathway
# inside a loop recopies all accumulated rows on every iteration.
# Rows are passed as parallel lists (not a dict keyed by leaf) so that a leaf pathway
# reachable through several branches keeps one row per branch.
leaf_pathways = [leaf for entry in padded_recorded_pathways for leaf in entry]
padded_rows = [row for entry in padded_recorded_pathways for row in entry.values()]
pathway_annotated = pd.DataFrame(
    padded_rows,
    index=pd.Index(leaf_pathways, name="feature"),  # index named 'feature' for readability
    columns=annotated_columns,
)