Convert a List of CAS numbers to SMILES using PubChem, CIRpy, or a Recurrent Neural Network.

Sulstice
4 min read · Sep 13, 2024

--

I get asked this question a lot, which seems a little odd, but here we go. To convert a list of CAS numbers to SMILES we can use one of three methods: query PubChem, query the Chemical Identifier Resolver (the service behind cirpy), or write a recurrent neural network to do it for us. As detailed in the diagram, a CAS number is 2–7 digits, followed by a hyphen and 2 digits, followed by another hyphen and a single check digit.

Method 1: PubChem

Using the PUG REST API, we can fetch a SMILES pretty easily; however, some values might not exist in PubChem, and you would then need to extend the search to other databases. I have found cirpy to be more flexible in that respect.

import requests
import pandas as pd

def _is_valid_cas(cas):
    """Return True if *cas* is a well-formed CAS Registry Number.

    A CAS number has the form ``NNNNNNN-NN-N``: 2-7 digits, a hyphen, 2 digits,
    a hyphen and one check digit. The check digit equals the sum of the other
    digits, each weighted by its position counted from the right (1, 2, 3, ...),
    modulo 10.
    """
    if not isinstance(cas, str):
        return False

    parts = cas.split("-")
    if len(parts) != 3:
        return False

    head, middle, check = parts
    if not (2 <= len(head) <= 7 and len(middle) == 2 and len(check) == 1):
        return False

    digits = head + middle
    # Explicit ASCII test: str.isdigit() would also accept characters such as
    # superscript digits, which int() cannot parse.
    if not all(ch in "0123456789" for ch in digits + check):
        return False

    weighted_sum = sum(
        weight * int(ch) for weight, ch in enumerate(reversed(digits), start=1)
    )
    return weighted_sum % 10 == int(check)


def cas_to_smiles(cas):
    """Look up the SMILES string for a CAS number via the PubChem PUG REST API.

    Args:
        cas: CAS Registry Number as a string, e.g. ``"50-00-0"``.

    Returns:
        The SMILES string on success. On failure a human-readable message is
        returned instead of raising: ``"Invalid CAS number format"`` for a
        malformed number, ``"Error fetching data: ..."`` for HTTP/network
        problems, or ``"CAS number not found or SMILES not available"`` when
        the response carries no SMILES.
    """
    # Validate first so malformed input never reaches the network.
    if not _is_valid_cas(cas):
        return "Invalid CAS number format"

    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{cas}/property/IsomericSMILES/JSON"

    try:
        # A timeout keeps one unresponsive request from stalling a whole batch.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        properties = data['PropertyTable']['Properties'][0]
    except requests.exceptions.RequestException as e:
        return f"Error fetching data: {e}"
    except (KeyError, IndexError):
        return "CAS number not found or SMILES not available"

    # NOTE(review): newer PubChem responses appear to label this value
    # 'SMILES' rather than 'IsomericSMILES' -- accept either; confirm against
    # the current PUG REST documentation.
    smiles = properties.get('IsomericSMILES') or properties.get('SMILES')
    if not smiles:
        return "CAS number not found or SMILES not available"
    return smiles

if __name__ == "__main__":

    # Demo input: a handful of common compounds.
    cas_numbers = [
        "50-00-0",    # Formaldehyde
        "67-64-1",    # Acetone
        "71-43-2",    # Benzene
        "64-17-5",    # Ethanol
        "7732-18-5",  # Water
    ]

    # One lookup per CAS number, in input order.
    molecules = list(map(cas_to_smiles, cas_numbers))

    # Build the two-column result table in a single step.
    df = pd.DataFrame({'CAS': cas_numbers, 'SMILES': molecules})

    # Persist the table, then echo it to the console.
    output_file = 'cas_to_smiles_results.csv'
    df.to_csv(output_file, index=False)

    print("\nResults:")
    print(df)

Method 2: CIRpy

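The popular package cirpy is used a lot for chemical translation and is pretty flexible at finding cross-references between names. It is a thin wrapper around the NCI/CADD Chemical Identifier Resolver (CIR) web service, which the snippet below queries directly with requests. I’ve always found it useful, except that requests sometimes time out: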

import requests
import pandas as pd

def cas_to_smiles(cas_number):
    """Resolve a CAS number to SMILES via the NCI/CADD Chemical Identifier Resolver.

    Args:
        cas_number: CAS Registry Number (or any identifier the resolver
            understands), e.g. ``"50-00-0"``.

    Returns:
        The SMILES text returned by the service, or ``None`` when the input is
        empty, the request fails, or the response body is empty.
    """
    from urllib.parse import quote  # local import: only this function needs it

    identifier = "" if cas_number is None else str(cas_number).strip()
    if not identifier:
        # Nothing to resolve; avoid requesting a malformed ".../structure//smiles" URL.
        return None

    # Percent-encode so identifiers containing spaces, '#', '/' etc. stay a
    # single path segment (a no-op for plain CAS numbers).
    url = f"https://cactus.nci.nih.gov/chemical/structure/{quote(identifier, safe='')}/smiles"

    try:
        # The resolver can be slow; a timeout keeps a batch run from hanging.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for CAS {cas_number}: {e}")
        return None

    smiles = response.text.strip()
    return smiles if smiles else None

if __name__ == "__main__":

    # Demo input: a handful of common compounds.
    cas_numbers = [
        "50-00-0",    # Formaldehyde
        "67-64-1",    # Acetone
        "71-43-2",    # Benzene
        "64-17-5",    # Ethanol
        "7732-18-5",  # Water
    ]

    print("Converting CAS numbers to SMILES...")
    # One resolver call per CAS number, in input order (None marks a failure).
    molecules = list(map(cas_to_smiles, cas_numbers))

    # Build the two-column result table in a single step.
    df = pd.DataFrame({'CAS': cas_numbers, 'SMILES': molecules})

    print("\nResults:")
    print(df)

Method 3: Recurrent Neural Network

This is just an initial implementation, but for something more robust down the line we could train a recurrent neural network. CAS numbers have a history, and several CAS numbers can refer to the same molecule, so the mapping is a little tough to predict. However, if there is some organization or set of rules behind the numbering, maybe an AI program could find it. This is just a simplistic example.

import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Index 0 of each vocabulary is reserved for an explicit padding /
# end-of-sequence symbol (the empty string). The encoded tensors in this file
# are zero-initialised, so without this reserved slot padding would be
# indistinguishable from the first real character ('0' for CAS, 'C' for
# SMILES), and looking up '' in smiles_char_to_idx would raise KeyError.
PAD_TOKEN = ''

cas_chars = (PAD_TOKEN, *'0123456789-')
smiles_chars = (PAD_TOKEN, *'C(=O)c1234567890[]')  # Simplified SMILES character set

# Character -> integer index lookups used when encoding strings as tensors.
cas_char_to_idx = {char: idx for idx, char in enumerate(cas_chars)}
smiles_char_to_idx = {char: idx for idx, char in enumerate(smiles_chars)}

MAX_CAS_LEN = 12  # Maximum length of a CAS number (e.g., "1234567-89-0")
MAX_SMILES_LEN = 50  # Maximum length of SMILES string

class CAStoSMILESDataset(Dataset):
    """Paired CAS-number / SMILES strings served as fixed-length index tensors.

    Each item is a ``(cas_tensor, smiles_tensor)`` pair of ``torch.long``
    tensors of length ``MAX_CAS_LEN`` and ``MAX_SMILES_LEN``. Positions past
    the end of the string keep the value 0.
    """

    def __init__(self, cas_list, smiles_list):
        """Store the parallel lists; ``cas_list[i]`` pairs with ``smiles_list[i]``.

        Raises:
            ValueError: if the two lists differ in length.
        """
        if len(cas_list) != len(smiles_list):
            raise ValueError(
                f"cas_list and smiles_list must have the same length "
                f"(got {len(cas_list)} and {len(smiles_list)})"
            )
        self.cas_list = cas_list
        self.smiles_list = smiles_list

    def __len__(self):
        return len(self.cas_list)

    @staticmethod
    def _encode(text, char_to_idx, max_len, kind):
        """Encode *text* as a zero-padded ``torch.long`` tensor of length *max_len*.

        Raises:
            IndexError: if *text* is longer than *max_len*.
            KeyError: if *text* contains a character missing from *char_to_idx*.
        """
        # Same exception types the raw tensor/dict accesses would raise, but
        # with a message that names the offending input.
        if len(text) > max_len:
            raise IndexError(
                f"{kind} {text!r} is longer than the maximum of {max_len} characters"
            )
        encoded = torch.zeros(max_len, dtype=torch.long)
        for i, char in enumerate(text):
            if char not in char_to_idx:
                raise KeyError(
                    f"{kind} {text!r} contains unsupported character {char!r}"
                )
            encoded[i] = char_to_idx[char]
        return encoded

    def __getitem__(self, idx):
        """Return the encoded ``(cas_tensor, smiles_tensor)`` pair at *idx*."""
        cas_tensor = self._encode(
            self.cas_list[idx], cas_char_to_idx, MAX_CAS_LEN, "CAS number"
        )
        smiles_tensor = self._encode(
            self.smiles_list[idx], smiles_char_to_idx, MAX_SMILES_LEN, "SMILES"
        )
        return cas_tensor, smiles_tensor

class CAStoSMILESRNN(nn.Module):
    """Embedding -> GRU -> linear model mapping input indices to output logits.

    Args:
        input_size: number of distinct input symbols (embedding rows).
        hidden_size: width of the embedding and of the GRU hidden state.
        output_size: number of output classes scored by the final linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Run the input through the GRU and score the final time step.

        Args:
            input: ``torch.long`` index tensor, either a single index (0-d) or
                a 1-d sequence of indices.
            hidden: GRU hidden state of shape ``(1, 1, hidden_size)``.

        Returns:
            ``(output, hidden)``: logits of shape ``(1, output_size)`` and the
            updated hidden state of shape ``(1, 1, hidden_size)``.
        """
        # Lay the embeddings out as (seq_len, batch=1, hidden_size). Flattening
        # a whole sequence into one vector would give a feature width of
        # seq_len * hidden_size, which the GRU rejects.
        embedded = self.embedding(input).view(-1, 1, self.hidden_size)
        output, hidden = self.gru(embedded, hidden)
        # Only the last time step is projected, so the logits are always
        # (1, output_size) regardless of sequence length.
        output = self.out(output[-1])
        return output, hidden

    def init_hidden(self):
        """Return an all-zero initial hidden state of shape ``(1, 1, hidden_size)``."""
        return torch.zeros(1, 1, self.hidden_size)

def train(model, criterion, optimizer, dataloader, n_epochs):
    """Train *model* for *n_epochs* passes over *dataloader*.

    Args:
        model: module exposing ``init_hidden()`` and callable as
            ``model(cas, hidden) -> (output, hidden)``.
        criterion: loss called as ``criterion(output, target)`` with a
            one-element target tensor.
        optimizer: optimizer over the model's parameters.
        dataloader: iterable of ``(cas, smiles)`` batches whose first
            dimension is the batch.
        n_epochs: number of passes over the data.

    Prints the mean per-batch loss after every epoch (``nan`` if the loader
    yielded no batches).
    """
    for epoch in range(n_epochs):
        total_loss = 0
        # Counted by hand so loaders without __len__ work and an empty loader
        # does not cause a division by zero.
        n_batches = 0
        for cas, smiles in dataloader:
            optimizer.zero_grad()
            loss = 0

            # Visit every sample in the batch, not just the first, so
            # batch_size > 1 does not silently discard data.
            for cas_seq, smiles_seq in zip(cas, smiles):
                hidden = model.init_hidden()
                # One model call per target position, carrying the hidden state.
                for target in smiles_seq:
                    output, hidden = model(cas_seq, hidden)
                    loss += criterion(output, target.unsqueeze(0))

            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            n_batches += 1

        mean_loss = total_loss / n_batches if n_batches else float("nan")
        print(f"Epoch {epoch+1}/{n_epochs}, Loss: {mean_loss:.4f}")

def predict(model, cas):
    """Greedily decode a SMILES string for *cas* with a trained model.

    Args:
        model: module exposing ``init_hidden()`` and callable as
            ``model(cas_tensor, hidden) -> (output, hidden)``.
        cas: CAS number string; every character must be a key of
            ``cas_char_to_idx`` and the string must fit in ``MAX_CAS_LEN``.

    Returns:
        The predicted SMILES string, at most ``MAX_SMILES_LEN`` characters.
    """
    # '' is looked up as the end-of-sequence symbol. If the SMILES vocabulary
    # has no such entry, .get() yields None, which never equals a predicted
    # index, so decoding simply runs to MAX_SMILES_LEN instead of raising
    # KeyError.
    eos_idx = smiles_char_to_idx.get('')

    with torch.no_grad():
        hidden = model.init_hidden()

        # Encode the CAS number; unused trailing positions stay 0.
        cas_tensor = torch.zeros(MAX_CAS_LEN, dtype=torch.long)
        for i, char in enumerate(cas):
            cas_tensor[i] = cas_char_to_idx[char]

        predicted_chars = []
        for _ in range(MAX_SMILES_LEN):
            output, hidden = model(cas_tensor, hidden)
            predicted_idx = output.argmax().item()
            if predicted_idx == eos_idx:  # End of sequence
                break
            predicted_chars.append(smiles_chars[predicted_idx])

    return "".join(predicted_chars)

if __name__ == "__main__":

    # Toy training pairs: formaldehyde, ethanol and acetone.
    cas_list = ["50-00-0", "64-17-5", "67-64-1"]
    smiles_list = ["C=O", "CCO", "CC(=O)C"]

    # One sample per batch, visited in random order each epoch.
    dataset = CAStoSMILESDataset(cas_list, smiles_list)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

    # Vocabulary sizes fix the embedding input and the output layer width.
    model = CAStoSMILESRNN(len(cas_chars), 128, len(smiles_chars))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    train(model, criterion, optimizer, dataloader, n_epochs=100)

    # Try the model on a CAS number that was not in the training pairs.
    test_cas = "71-43-2"  # Benzene
    predicted_smiles = predict(model, test_cas)
    print(f"CAS: {test_cas}")
    print(f"Predicted SMILES: {predicted_smiles}")

We would need to extend the SMILES character set to cover the full grammar, and possibly handle other things as well. This would need to be more carefully thought out and tested against a plethora of edge cases. Just a thought, and I will leave you with that.

Happy Cheminformatics!

--

--

Sulstice
Sulstice

No responses yet