Convert a List of CAS numbers to SMILES using PubChem, CIRpy, or a Recurrent Neural Network.

4 min readSep 13, 2024

I get asked this question a lot, which seems a little odd, but here we go. To convert a list of CAS numbers to SMILES we can use one of three methods: PubChem, CIRpy, or a recurrent neural network that we write ourselves. As the diagram details, a CAS number is 2–7 digits, followed by a hyphen, then 2 digits, then another hyphen, and finally a single check digit.

Method 1: PubChem

Using the PUG REST API, we can fetch a SMILES string pretty easily; however, some CAS numbers may not resolve, in which case you would need to fall back to other databases. I have found CIRpy to be more flexible in that respect.

import requests
import pandas as pd

def is_valid_cas(cas):
    """Return True when *cas* is laid out like a CAS Registry Number.

    The accepted layout is ``<2-7 digits>-<2 digits>-<1 digit>``. Only the
    format is checked; the trailing check digit is not verified.

    Args:
        cas: Candidate CAS number. Anything that is not a ``str`` is rejected.

    Returns:
        ``True`` if the layout matches, ``False`` otherwise.
    """
    if not isinstance(cas, str):
        return False

    parts = cas.split("-")
    if len(parts) != 3:
        return False

    # Explicit ASCII digit test: str.isdigit() also accepts other Unicode digits.
    if not all(ch in "0123456789" for part in parts for ch in part):
        return False

    head, middle, check = parts
    return 2 <= len(head) <= 7 and len(middle) == 2 and len(check) == 1


def cas_to_smiles(cas, timeout=10):
    """Look up the isomeric SMILES for a CAS number via PubChem PUG REST.

    Args:
        cas: CAS number such as ``"50-00-0"``.
        timeout: Seconds to wait for PubChem before giving up.

    Returns:
        The SMILES string on success. On failure a human-readable message is
        returned instead of raising: ``"Invalid CAS number format"``,
        ``"Error fetching data: ..."`` or
        ``"CAS number not found or SMILES not available"``.
    """
    if not is_valid_cas(cas):
        return "Invalid CAS number format"

    # A validated CAS number contains only digits and hyphens, so it is safe
    # to place in the URL path without escaping.
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{cas}/property/IsomericSMILES/JSON"

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        properties = data['PropertyTable']['Properties'][0]
        if 'IsomericSMILES' in properties:
            return properties['IsomericSMILES']
        # NOTE(review): newer PubChem responses appear to label this value
        # 'SMILES' even when 'IsomericSMILES' is requested - confirm against
        # the current PUG REST documentation.
        return properties['SMILES']
    except requests.exceptions.RequestException as e:
        return f"Error fetching data: {e}"
    except (KeyError, IndexError, TypeError, ValueError):
        # Unexpected payload shape, or a body that is not valid JSON.
        return "CAS number not found or SMILES not available"

if __name__ == "__main__":
    # Demo run: resolve a handful of common solvents, save the table to CSV,
    # and echo it to the console.
    cas_numbers = [
        "50-00-0",    # Formaldehyde
        "67-64-1",    # Acetone
        "71-43-2",    # Benzene
        "64-17-5",    # Ethanol
        "7732-18-5",  # Water
    ]

    # One lookup per CAS number, in input order.
    df = pd.DataFrame({
        'CAS': cas_numbers,
        'SMILES': list(map(cas_to_smiles, cas_numbers)),
    })

    df.to_csv('cas_to_smiles_results.csv', index=False)

    print("\nResults:")
    print(df)

Method 2: Cirpy

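The popular package CIRpy is widely used for chemical identifier translation and is pretty flexible at cross-referencing names. CIRpy is a Python wrapper around the NCI/CADD Chemical Identifier Resolver (CACTUS); the snippet below calls that same web service directly with requests. I've always found it useful, except that requests sometimes time out: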

import requests
import pandas as pd

def cas_to_smiles(cas_number, timeout=10):
    """Resolve a CAS number to SMILES with the NCI CACTUS resolver.

    Args:
        cas_number: Identifier to resolve, e.g. ``"50-00-0"``.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The response text with surrounding whitespace removed, or ``None``
        when the response is empty or the request fails (the failure is
        printed).
    """
    from urllib.parse import quote

    # The identifier is not validated here, so escape it: a stray '/', '?'
    # or '#' would otherwise change which resource is requested.
    identifier = quote(str(cas_number), safe="")
    url = f"https://cactus.nci.nih.gov/chemical/structure/{identifier}/smiles"

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        smiles = response.text.strip()
        return smiles if smiles else None

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for CAS {cas_number}: {e}")
        return None

if __name__ == "__main__":
    # Demo run: resolve a handful of common solvents and print the table.
    cas_numbers = [
        "50-00-0",    # Formaldehyde
        "67-64-1",    # Acetone
        "71-43-2",    # Benzene
        "64-17-5",    # Ethanol
        "7732-18-5",  # Water
    ]

    print("Converting CAS numbers to SMILES...")

    # One lookup per CAS number, in input order; failed lookups yield None.
    df = pd.DataFrame({
        'CAS': cas_numbers,
        'SMILES': list(map(cas_to_smiles, cas_numbers)),
    })

    print("\nResults:")
    print(df)

Method 3: Recurrent Neural Network

This would just be the initial implementation but for something more robust down the line we can implement a recurrent neural network. CAS numbers have a history and some CAS numbers could be the same molecule. It’s a little tough to predict. However, if there is some organization or rules behind it maybe an AI program could find out. This is just a simplistic example.

import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Vocabularies are tuples of symbols. Index 0 of each is a reserved
# empty-string entry: tensors are zero-filled before encoding, so index 0
# must not also stand for a real character ('0' in a CAS number, 'C' in
# SMILES). For SMILES the '' entry also serves as the end-of-sequence marker.
cas_chars = ('',) + tuple('0123456789-')
smiles_chars = ('',) + tuple('C(=O)c1234567890[]')  # Simplified SMILES character set

cas_char_to_idx = {char: idx for idx, char in enumerate(cas_chars)}
smiles_char_to_idx = {char: idx for idx, char in enumerate(smiles_chars)}

MAX_CAS_LEN = 12  # Maximum length of CAS number (e.g., "1234567-89-0")
MAX_SMILES_LEN = 50  # Maximum length of SMILES string

class CAStoSMILESDataset(Dataset):
    """Paired CAS-number / SMILES strings encoded as fixed-length index tensors.

    Each item is ``(cas_tensor, smiles_tensor)``: ``torch.long`` tensors of
    length ``MAX_CAS_LEN`` and ``MAX_SMILES_LEN``. Characters are mapped
    through ``cas_char_to_idx`` / ``smiles_char_to_idx``; positions past the
    end of the string stay 0.
    """

    def __init__(self, cas_list, smiles_list):
        """Store the parallel lists.

        Raises:
            ValueError: If the two lists differ in length.
        """
        # Without this check a mismatch only surfaces later as an IndexError
        # (too few SMILES) or as silently ignored entries (too many SMILES).
        if len(cas_list) != len(smiles_list):
            raise ValueError(
                f"cas_list has {len(cas_list)} entries but smiles_list has "
                f"{len(smiles_list)}"
            )
        self.cas_list = cas_list
        self.smiles_list = smiles_list

    def __len__(self):
        return len(self.cas_list)

    @staticmethod
    def _encode(text, vocab, max_len, label):
        """Encode *text* as a zero-filled ``torch.long`` tensor of *max_len*.

        Raises:
            ValueError: If *text* is longer than *max_len* or contains a
                character that is missing from *vocab*.
        """
        if len(text) > max_len:
            raise ValueError(
                f"{label} {text!r} is longer than {max_len} characters"
            )
        unknown = sorted({ch for ch in text if ch not in vocab})
        if unknown:
            raise ValueError(
                f"{label} {text!r} contains unsupported characters: {unknown}"
            )

        encoded = torch.zeros(max_len, dtype=torch.long)
        if text:
            encoded[:len(text)] = torch.tensor(
                [vocab[ch] for ch in text], dtype=torch.long
            )
        return encoded

    def __getitem__(self, idx):
        """Return ``(cas_tensor, smiles_tensor)`` for the pair at *idx*."""
        cas_tensor = self._encode(
            self.cas_list[idx], cas_char_to_idx, MAX_CAS_LEN, "CAS number"
        )
        smiles_tensor = self._encode(
            self.smiles_list[idx], smiles_char_to_idx, MAX_SMILES_LEN, "SMILES"
        )
        return cas_tensor, smiles_tensor

class CAStoSMILESRNN(nn.Module):
    """Embedding -> GRU -> linear model that scores the next output symbol.

    Args:
        input_size: Number of distinct input indices (embedding rows).
        hidden_size: Width of both the embedding and the GRU state.
        output_size: Number of output classes scored by the final layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Run the GRU over *input* and score the final step.

        Args:
            input: A single index (0-dim tensor) or a 1-D tensor of indices.
            hidden: GRU state of shape ``(1, 1, hidden_size)``.

        Returns:
            ``(output, hidden)``: logits of shape ``(1, output_size)`` and the
            updated state of shape ``(1, 1, hidden_size)``.
        """
        # Lay the indices out as (seq_len, batch=1, hidden_size). Flattening a
        # whole sequence into a single step would hand the GRU a feature
        # vector of seq_len * hidden_size, which it rejects.
        embedded = self.embedding(input).view(-1, 1, self.hidden_size)
        output, hidden = self.gru(embedded, hidden)
        # Only the last time step is scored; it has seen the whole sequence.
        output = self.out(output[-1])
        return output, hidden

    def init_hidden(self):
        """Return an all-zero GRU state of shape ``(1, 1, hidden_size)``."""
        return torch.zeros(1, 1, self.hidden_size)

def train(model, criterion, optimizer, dataloader, n_epochs):
    """Train *model* to emit each SMILES index sequence from its CAS tensor.

    For every sample the hidden state is reset with ``model.init_hidden()``
    and the model is called once per target position with the sample's CAS
    tensor; the per-position losses are summed. One optimizer step is taken
    per batch on the mean of the per-sample losses, and the average batch
    loss is printed after each epoch.

    Args:
        model: Object providing ``init_hidden()`` and ``model(cas, hidden)``
            returning ``(output, hidden)``.
        criterion: Loss called as ``criterion(output, target)`` with a
            one-element target tensor.
        optimizer: Optimizer over the model's parameters.
        dataloader: Iterable of ``(cas_batch, smiles_batch)`` tensor pairs.
        n_epochs: Number of passes over *dataloader*.
    """
    for epoch in range(n_epochs):
        total_loss = 0.0
        n_batches = 0

        for cas_batch, smiles_batch in dataloader:
            n_batches += 1
            optimizer.zero_grad()
            batch_loss = 0
            n_steps = 0

            # Use every sample in the batch, not only the first, so that
            # batch sizes above 1 do not silently drop training data.
            for cas, smiles in zip(cas_batch, smiles_batch):
                hidden = model.init_hidden()
                for target in smiles:
                    output, hidden = model(cas, hidden)
                    batch_loss = batch_loss + criterion(output, target.unsqueeze(0))
                    n_steps += 1

            if n_steps == 0:
                # Nothing to learn from an empty batch.
                continue

            # Mean over samples keeps the loss scale independent of the batch
            # size (and unchanged for batch_size=1).
            batch_loss = batch_loss / len(cas_batch)
            batch_loss.backward()
            optimizer.step()
            total_loss += batch_loss.item()

        # Batches are counted while iterating so an empty loader cannot
        # cause a division by zero.
        print(f"Epoch {epoch+1}/{n_epochs}, Loss: {total_loss/max(n_batches, 1):.4f}")

def predict(model, cas):
    """Greedily decode a SMILES string for *cas* with *model*.

    The CAS string is encoded into a zero-filled tensor of length
    ``MAX_CAS_LEN``; the model is then called up to ``MAX_SMILES_LEN`` times
    and the arg-max symbol of each step is appended to the result.

    Args:
        model: Object providing ``init_hidden()`` and ``model(cas, hidden)``
            returning ``(output, hidden)``.
        cas: CAS number whose characters are all keys of ``cas_char_to_idx``.

    Returns:
        The decoded string, at most ``MAX_SMILES_LEN`` symbols long.
    """
    # The vocabulary may register an end-of-sequence entry under the
    # empty-string key. When it does not, .get() yields None, which never
    # equals a predicted index, so decoding runs to MAX_SMILES_LEN instead
    # of failing with a KeyError.
    eos_idx = smiles_char_to_idx.get('')

    with torch.no_grad():
        hidden = model.init_hidden()
        cas_tensor = torch.zeros(MAX_CAS_LEN, dtype=torch.long)
        for i, char in enumerate(cas):
            cas_tensor[i] = cas_char_to_idx[char]

        decoded = []
        for _ in range(MAX_SMILES_LEN):
            output, hidden = model(cas_tensor, hidden)
            predicted_idx = output.argmax().item()
            if predicted_idx == eos_idx:  # End of sequence
                break
            decoded.append(smiles_chars[predicted_idx])

        return "".join(decoded)

if __name__ == "__main__":
    # Toy run: fit three CAS -> SMILES pairs, then query a CAS number that
    # was not part of the training data.
    cas_list = ["50-00-0", "64-17-5", "67-64-1"]
    smiles_list = ["C=O", "CCO", "CC(=O)C"]

    dataloader = DataLoader(
        CAStoSMILESDataset(cas_list, smiles_list),
        batch_size=1,
        shuffle=True,
    )

    model = CAStoSMILESRNN(len(cas_chars), 128, len(smiles_chars))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    train(model, criterion, optimizer, dataloader, n_epochs=100)

    test_cas = "71-43-2"  # Benzene
    print(f"CAS: {test_cas}")
    print(f"Predicted SMILES: {predict(model, test_cas)}")

We would need to extend the SMILES character set to cover the full grammar, and possibly add other features. The approach would need to be thought out more carefully and would have to handle a plethora of additional edge cases. Just a thought, and I will leave you with that.

Happy Cheminformatics!

Convert a List of CAS numbers to SMILES using Pubmed, CirPy, or a Recurrent Neural Network.

Sign up to discover human stories that deepen your understanding of the world.

Free

Membership

Written by Sulstice

No responses yet