Convert a List of CAS numbers to SMILES using PubChem, CIRpy, or a Recurrent Neural Network.
I get asked this question a lot, which seems a little odd, but here we go. To convert a list of CAS
numbers to SMILES we have three options: query PubChem,
use CIRpy,
or write a recurrent neural network to do it for us. As the diagram shows, a CAS number is two to seven digits, followed by a hyphen, two more digits, another hyphen, and a final check digit.
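Before querying anything it is worth validating that format. Here is a minimal sketch of a validator, including the CAS check-digit rule (the final digit equals the sum of the preceding digits, each multiplied by its position counting from the right, modulo 10); is_valid_cas is just a helper name I am introducing here:
import re

CAS_RE = re.compile(r'^(\d{2,7})-(\d{2})-(\d)$')

def is_valid_cas(cas):
    # Check the NNNNNNN-NN-N shape, then verify the check digit
    match = CAS_RE.match(cas)
    if not match:
        return False
    digits = (match.group(1) + match.group(2))[::-1]  # all digits except the check digit, rightmost first
    checksum = sum((i + 1) * int(d) for i, d in enumerate(digits)) % 10
    return checksum == int(match.group(3))

print(is_valid_cas("7732-18-5"))   # True  (water)
print(is_valid_cas("7732-18-4"))   # False (wrong check digit)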
Method 1: PubChem
Using PubChem's PUG REST API, we can fetch a SMILES pretty easily. However, some CAS numbers simply are not in PubChem, and you would then need to extend the lookup to other databases; I have found CIRpy
to be more flexible in that respect.
import re
import requests
import pandas as pd

def is_valid_cas(cas):
    # Format-only check; the stricter check-digit validator above could be used instead
    return re.match(r'^\d{2,7}-\d{2}-\d$', cas) is not None

def cas_to_smiles(cas):
    if not is_valid_cas(cas):
        return "Invalid CAS number format"
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{cas}/property/IsomericSMILES/JSON"
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        smiles = data['PropertyTable']['Properties'][0]['IsomericSMILES']
        return smiles
    except requests.exceptions.RequestException as e:
        return f"Error fetching data: {e}"
    except (KeyError, IndexError):
        return "CAS number not found or SMILES not available"
if __name__ == "__main__":
    cas_numbers = [
        "50-00-0",   # Formaldehyde
        "67-64-1",   # Acetone
        "71-43-2",   # Benzene
        "64-17-5",   # Ethanol
        "7732-18-5"  # Water
    ]
    molecules = [cas_to_smiles(cas) for cas in cas_numbers]
    df = pd.DataFrame()
    df['CAS'] = cas_numbers
    df['SMILES'] = molecules
    output_file = 'cas_to_smiles_results.csv'
    df.to_csv(output_file, index=False)
    print("\nResults:")
    print(df)
Method 2: CIRpy
The CIRpy package is used a lot for chemical identifier translation and is pretty flexible at finding cross-references between names and structures. I have always found it useful, except that you occasionally run into request timeouts.
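Here is a minimal sketch of the lookup using the cirpy package itself (pip install cirpy); the retry loop is my own addition to paper over the occasional timeout:
import time
import cirpy

def cas_to_smiles_cirpy(cas, retries=3):
    for attempt in range(retries):
        try:
            # cirpy.resolve returns None when nothing is found
            return cirpy.resolve(cas, 'smiles')
        except Exception as exc:  # e.g. a timeout from the CIR service
            print(f"Attempt {attempt + 1} failed for {cas}: {exc}")
            time.sleep(1)
    return None

print(cas_to_smiles_cirpy("50-00-0"))  # Formaldehyde
CIRpy is a thin wrapper around the NCI/CADD Chemical Identifier Resolver (CIR) service, so if you would rather avoid the extra dependency you can call that service directly with requests, which is what the next snippet does: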
import requests
import pandas as pd

def cas_to_smiles(cas_number):
    url = f"https://cactus.nci.nih.gov/chemical/structure/{cas_number}/smiles"
    try:
        response = requests.get(url)
        response.raise_for_status()
        smiles = response.text.strip()
        return smiles if smiles else None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for CAS {cas_number}: {e}")
        return None
if __name__ == "__main__":
    cas_numbers = [
        "50-00-0",   # Formaldehyde
        "67-64-1",   # Acetone
        "71-43-2",   # Benzene
        "64-17-5",   # Ethanol
        "7732-18-5"  # Water
    ]
    print("Converting CAS numbers to SMILES...")
    molecules = [cas_to_smiles(cas) for cas in cas_numbers]
    df = pd.DataFrame()
    df['CAS'] = cas_numbers
    df['SMILES'] = molecules
    print("\nResults:")
    print(df)
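Since PubChem sometimes has no entry for a given CAS number, a practical pattern is to chain the two services: ask PubChem first and fall back to CIR. A minimal sketch, with simplified versions of the two lookups above that return None on failure (the names pubchem_lookup, cir_lookup, and resolve_cas are mine):
import requests

def pubchem_lookup(cas):
    # Same PubChem PUG REST endpoint as Method 1, but returns None on any failure
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{cas}/property/IsomericSMILES/JSON"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.json()['PropertyTable']['Properties'][0]['IsomericSMILES']
    except Exception:
        return None

def cir_lookup(cas):
    # Same CIR endpoint as Method 2, but returns None on any failure
    url = f"https://cactus.nci.nih.gov/chemical/structure/{cas}/smiles"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text.strip() or None
    except Exception:
        return None

def resolve_cas(cas):
    # Try PubChem first, then fall back to CIR
    for lookup in (pubchem_lookup, cir_lookup):
        smiles = lookup(cas)
        if smiles:
            return smiles
    return None

print(resolve_cas("7732-18-5"))  # Water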
Method 3: Recurrent Neural Network
The methods above would just be the initial implementation; for something more robust down the line we could implement a recurrent neural network. CAS numbers have a history, and some CAS numbers can refer to the same molecule, so they are a little tough to predict from. However, if there is some organization or set of rules behind the assignments, maybe a model could find it. This is just a simplistic example:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

cas_chars = '0123456789-'
smiles_chars = '\nC(=O)c1234567890[]'  # Simplified SMILES character set; '\n' (index 0) doubles as padding / end-of-sequence
cas_char_to_idx = {char: idx for idx, char in enumerate(cas_chars)}
smiles_char_to_idx = {char: idx for idx, char in enumerate(smiles_chars)}

MAX_CAS_LEN = 12     # Maximum length of a CAS number (e.g., "1234567-89-0")
MAX_SMILES_LEN = 50  # Maximum length of a SMILES string
class CAStoSMILESDataset(Dataset):
    def __init__(self, cas_list, smiles_list):
        self.cas_list = cas_list
        self.smiles_list = smiles_list

    def __len__(self):
        return len(self.cas_list)

    def __getitem__(self, idx):
        cas = self.cas_list[idx]
        smiles = self.smiles_list[idx]
        # Convert CAS to a tensor of character indices (zero-padded)
        cas_tensor = torch.zeros(MAX_CAS_LEN, dtype=torch.long)
        for i, char in enumerate(cas):
            cas_tensor[i] = cas_char_to_idx[char]
        # Convert SMILES to a tensor of character indices (zero-padded)
        smiles_tensor = torch.zeros(MAX_SMILES_LEN, dtype=torch.long)
        for i, char in enumerate(smiles):
            smiles_tensor[i] = smiles_char_to_idx[char]
        return cas_tensor, smiles_tensor
class CAStoSMILESRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(CAStoSMILESRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        # input is a 1-D tensor of CAS character indices
        embedded = self.embedding(input).view(len(input), 1, -1)  # (seq_len, batch=1, hidden)
        output, hidden = self.gru(embedded, hidden)
        output = self.out(output[-1])  # logits for the next SMILES character
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size)
def train(model, criterion, optimizer, dataloader, n_epochs):
    for epoch in range(n_epochs):
        total_loss = 0
        for cas, smiles in dataloader:
            hidden = model.init_hidden()
            optimizer.zero_grad()
            loss = 0
            # At each step the model re-reads the whole CAS sequence and
            # predicts the next SMILES character (index 0 is the padding/end token)
            for i in range(smiles.size(1)):
                output, hidden = model(cas[0], hidden)
                loss += criterion(output, smiles[0][i].unsqueeze(0))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{n_epochs}, Loss: {total_loss/len(dataloader):.4f}")
def predict(model, cas):
    with torch.no_grad():
        hidden = model.init_hidden()
        cas_tensor = torch.zeros(MAX_CAS_LEN, dtype=torch.long)
        for i, char in enumerate(cas):
            cas_tensor[i] = cas_char_to_idx[char]
        predicted_smiles = ""
        for _ in range(MAX_SMILES_LEN):
            output, hidden = model(cas_tensor, hidden)
            predicted_idx = output.argmax().item()
            if predicted_idx == smiles_char_to_idx['\n']:  # Padding / end-of-sequence token
                break
            predicted_char = smiles_chars[predicted_idx]
            predicted_smiles += predicted_char
        return predicted_smiles
if __name__ == "__main__":
    cas_list = ["50-00-0", "64-17-5", "67-64-1"]
    smiles_list = ["C=O", "CCO", "CC(=O)C"]
    dataset = CAStoSMILESDataset(cas_list, smiles_list)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    model = CAStoSMILESRNN(len(cas_chars), 128, len(smiles_chars))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())
    train(model, criterion, optimizer, dataloader, n_epochs=100)
    test_cas = "71-43-2"  # Benzene
    predicted_smiles = predict(model, test_cas)
    print(f"CAS: {test_cas}")
    print(f"Predicted SMILES: {predicted_smiles}")
We would need to extend the SMILES character set to cover the full grammar, and the whole thing would need to be far better thought out, with a plethora of edge cases handled. Just a thought, and I will leave you with that.
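One small step in that direction is to derive the SMILES vocabulary from the training data instead of hard-coding it. A minimal sketch (build_vocab is a hypothetical helper, not part of the code above):
def build_vocab(smiles_list, pad_token='\n'):
    # Collect every character that appears in the training SMILES,
    # keeping index 0 for the padding / end-of-sequence token
    chars = sorted(set(''.join(smiles_list)))
    vocab = pad_token + ''.join(chars)
    return vocab, {char: idx for idx, char in enumerate(vocab)}

smiles_chars, smiles_char_to_idx = build_vocab(["C=O", "CCO", "CC(=O)C", "c1ccccc1"])
print(list(smiles_chars))  # the pad token plus every character seen in the data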
Happy Cheminformatics!