alimotahharynia
committed on
Update README.md
Browse files
README.md
CHANGED
@@ -28,116 +28,41 @@ DrugGen is a GPT-2 based model specialized for generating drug-like SMILES struc
|
|
28 |
- Model Sources: liyuesen/druggpt
|
29 |
|
30 |
## How to Get Started with the Model
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
"max_retries": 30 # Max retry limit to avoid infinite loops
|
52 |
-
}
|
53 |
-
|
54 |
-
# Load model and tokenizer
|
55 |
-
self.model_name = self.config["model_name"]
|
56 |
-
self.model, self.tokenizer = self.load_model_and_tokenizer(self.model_name)
|
57 |
-
|
58 |
-
# Load UniProt mapping dataset
|
59 |
-
dataset_name = self.config["dataset_name"]
|
60 |
-
dataset_key = self.config["dataset_key"]
|
61 |
-
self.uniprot_to_sequence = self.load_uniprot_mapping(dataset_name, dataset_key)
|
62 |
-
|
63 |
-
# Adjust generation parameters with token IDs
|
64 |
-
self.generation_kwargs = self.config["generation_kwargs"]
|
65 |
-
self.generation_kwargs["bos_token_id"] = self.tokenizer.bos_token_id
|
66 |
-
self.generation_kwargs["eos_token_id"] = self.tokenizer.eos_token_id
|
67 |
-
self.generation_kwargs["pad_token_id"] = self.tokenizer.pad_token_id
|
68 |
-
|
69 |
-
def load_model_and_tokenizer(self, model_name):
|
70 |
-
|
71 |
-
print(f"Loading model and tokenizer: {model_name}")
|
72 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
73 |
-
model = GPT2LMHeadModel.from_pretrained(model_name)
|
74 |
-
return model, tokenizer
|
75 |
-
|
76 |
-
def load_uniprot_mapping(self, dataset_name, dataset_key):
|
77 |
-
|
78 |
-
print(f"Loading dataset: {dataset_name}")
|
79 |
-
try:
|
80 |
-
dataset = load_dataset(dataset_name, dataset_key)
|
81 |
-
return {row["UniProt_id"]: row["Sequence"] for row in dataset["uniprot_seq"]}
|
82 |
-
except Exception as e:
|
83 |
-
raise RuntimeError(f"Failed to load dataset {dataset_name}: {e}")
|
84 |
-
|
85 |
-
def generate_smiles(self, sequence, num_generated):
|
86 |
-
"""
|
87 |
-
Generate unique SMILES with a retry limit to avoid infinite loops.
|
88 |
-
"""
|
89 |
-
generated_smiles_set = set()
|
90 |
-
prompt = f"<|startoftext|><P>{sequence}<L>"
|
91 |
-
encoded_prompt = self.tokenizer(prompt, return_tensors="pt")["input_ids"]
|
92 |
-
retries = 0
|
93 |
-
|
94 |
-
while len(generated_smiles_set) < num_generated:
|
95 |
-
if retries >= self.config["max_retries"]:
|
96 |
-
print("Max retries reached. Returning what has been generated so far.")
|
97 |
-
break
|
98 |
-
|
99 |
-
sample_outputs = self.model.generate(encoded_prompt, **self.generation_kwargs)
|
100 |
-
for sample_output in sample_outputs:
|
101 |
-
output_decode = self.tokenizer.decode(sample_output, skip_special_tokens=False)
|
102 |
-
try:
|
103 |
-
generated_smiles = output_decode.split("<L>")[1].split("<|endoftext|>")[0]
|
104 |
-
if generated_smiles not in generated_smiles_set:
|
105 |
-
generated_smiles_set.add(generated_smiles)
|
106 |
-
except IndexError:
|
107 |
-
continue
|
108 |
-
|
109 |
-
retries += 1
|
110 |
-
|
111 |
-
return list(generated_smiles_set)
|
112 |
-
|
113 |
-
def generate_smiles_data(self, list_of_sequences=None, list_of_uniprot_ids=None, num_generated=10):
|
114 |
-
"""
|
115 |
-
Generate SMILES data for sequences or UniProt IDs.
|
116 |
-
"""
|
117 |
-
if not list_of_sequences and not list_of_uniprot_ids:
|
118 |
-
raise ValueError("Either `list_of_sequences` or `list_of_uniprot_ids` must be provided.")
|
119 |
-
|
120 |
-
# Prepare sequences input
|
121 |
-
if list_of_sequences:
|
122 |
-
sequences_input = list_of_sequences
|
123 |
-
else:
|
124 |
-
sequences_input = [
|
125 |
-
self.uniprot_to_sequence[uid]
|
126 |
-
for uid in list_of_uniprot_ids
|
127 |
-
if uid in self.uniprot_to_sequence
|
128 |
-
]
|
129 |
-
|
130 |
-
data = []
|
131 |
-
for sequence in sequences_input:
|
132 |
-
smiles = self.generate_smiles(sequence, num_generated)
|
133 |
-
uniprot_id = next((uid for uid, seq in self.uniprot_to_sequence.items() if seq == sequence), None)
|
134 |
-
data.append({"UniProt_id": uniprot_id, "sequence": sequence, "smiles": smiles})
|
135 |
-
|
136 |
-
return pd.DataFrame(data)
|
137 |
```
|
138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
```python
|
|
|
|
|
140 |
if __name__ == "__main__":
|
|
|
141 |
# Initialize the generator
|
142 |
generator = SMILESGenerator()
|
143 |
|
@@ -148,7 +73,7 @@ if __name__ == "__main__":
|
|
148 |
list_of_uniprot_ids = ["P12821", "P37231"]
|
149 |
|
150 |
# Generate SMILES data for sequences
|
151 |
-
# df = generator.generate_smiles_data(list_of_sequences=list_of_sequences, num_generated=
|
152 |
|
153 |
# Generate SMILES data for UniProt IDs
|
154 |
df = generator.generate_smiles_data(list_of_uniprot_ids=list_of_uniprot_ids, num_generated=2)
|
|
|
28 |
- Model Sources: liyuesen/druggpt
|
29 |
|
30 |
## How to Get Started with the Model
|
31 |
+
DrugGen can be used via a command-line interface (CLI) or integrated into Python scripts.
|
32 |
+
|
33 |
+
### Installation
|
34 |
+
Clone the repository and navigate to its directory:
|
35 |
+
```bash
|
36 |
+
git clone https://github.com/mahsasheikh/DrugGen.git
|
37 |
+
cd DrugGen
|
38 |
+
```
|
39 |
+
|
40 |
+
### Command-Line Interface
|
41 |
+
DrugGen provides a CLI to generate SMILES structures based on UniProt IDs, protein sequences, or both.
|
42 |
+
|
43 |
+
#### Generating SMILES Structures
|
44 |
+
```bash
|
45 |
+
python3 drugGen_generator_cli.py --uniprot_ids <UniProt_IDs> --sequences <Protein_Sequences> --num_generated <Number_of_Structures> --output_file <Output_File_Name>
|
46 |
+
```
|
47 |
+
|
48 |
+
#### Example Command
|
49 |
+
```bash
|
50 |
+
python3 drugGen_generator_cli.py --uniprot_ids P12821 P37231 --sequences "MGAASGRRGPGLLLPLPLLLLLPPQPALALDPGLQPGNFSADEAGAQLFAQSYNSSAEQVLFQSVAASWAHDTNITAENARRQEEAALLSQEFAEAWGQKAKELYEPIWQNFTDPQLRRIIGAVRTLGSANLPLAKRQQYNALLSNMSRIYSTAKVCLPNKTATCWSLDPDLTNILASSRSYAMLLFAWEGWHNAAGIPLKPLYEDFTALSNEAYKQDGFTDTGAYWRSWYNSPTFEDDLEHLYQQLEPLYLNLHAFVRRALHRRYGDRYINLRGPIPAHLLGDMWAQSWENIYDMVVPFPDKPNLDVTSTMLQQGWNATHMFRVAEEFFTSLELSPMPPEFWEGSMLEKPADGREVVCHASAWDFYNRKDFRIKQCTRVTMDQLSTVHHEMGHIQYYLQYKDLPVSLRRGANPGFHEAIGDVLALSVSTPEHLHKIGLLDRVTNDTESDINYLLKMALEKIAFLPFGYLVDQWRWGVFSGRTPPSRYNFDWWYLRTKYQGICPPVTRNETHFDAGAKFHVPNVTPYIRYFVSFVLQFQFHEALCKEAGYEGPLHQCDIYRSTKAGAKLRKVLQAGSSRPWQEVLKDMVGLDALDAQPLLKYFQPVTQWLQEQNQQNGEVLGWPEYQWHPPLPDNYPEGIDLVTDEAEASKFVEEYDRTSQVVWNEYAEANWNYNTNITTETSKILLQKNMQIANHTLKYGTQARKFDVNQLQNTTIKRIIKKVQDLERAALPAQELEEYNKILLDMETTYSVATVCHPNGSCLQLEPDLTNVMATSRKYEDLLWAWEGWRDKAGRAILQFYPKYVELINQAARLNGYVDAGDSWRSMYETPSLEQDLERLFQELQPLYLNLHAYVRRALHRHYGAQHINLEGPIPAHLLGNMWAQTWSNIYDLVVPFPSAPSMDTTEAMLKQGWTPRRMFKEADDFFTSLGLLPVPPEFWNKSMLEKPTDGREVVCHASAWDFYNGKDFRIKQCTTVNLEDLVVAHHEMGHIQYFMQYKDLPVALREGANPGFHEAIGDVLALSVSTPKHLHSLNLLSSEGGSDEHDINFLMKMALDKIAFIPFSYLVDQWRWRVFDGSITKENYNQEWWSLRLKYQGLCPPVPRTQGDFDPGAKFHIPSSVPYIRYFVSFIIQFQFHEALCQAAGHTGPLHKCDIYQSKEAGQRLATAMKLGFSRPWPEAMQLITGQPNMSASAMLSYFKPLLDWLRTENELHGEKLGWPQYNWTPNSARSEGPLPDSGRVSFLGLDLDAQQARVGQWLLLFLGIALLVATLGLSQRLFSIRHRSLHRHSHGPQFGSEVELRHS" --num_generated 10 --output_file g_smiles_test.txt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
```
|
52 |
+
#### Parameters
|
53 |
+
- uniprot_ids: Space-separated UniProt IDs.
|
54 |
+
- sequences: Space-separated protein sequences in string format.
|
55 |
+
- num_generated: Number of unique SMILES structures to generate.
|
56 |
+
- output_file: Name of the output file to save the generated structures.
|
57 |
+
|
58 |
+
|
59 |
+
### Python Integration
|
60 |
+
Adjust the `num_generated` parameter to specify the number of unique SMILES structures you wish to generate for each protein.
|
61 |
```python
|
62 |
+
from drugGen_generator import SMILESGenerator
|
63 |
+
|
64 |
if __name__ == "__main__":
|
65 |
+
|
66 |
# Initialize the generator
|
67 |
generator = SMILESGenerator()
|
68 |
|
|
|
73 |
list_of_uniprot_ids = ["P12821", "P37231"]
|
74 |
|
75 |
# Generate SMILES data for sequences
|
76 |
+
# df = generator.generate_smiles_data(list_of_sequences=list_of_sequences, num_generated=10)
|
77 |
|
78 |
# Generate SMILES data for UniProt IDs
|
79 |
df = generator.generate_smiles_data(list_of_uniprot_ids=list_of_uniprot_ids, num_generated=2)
|