alimotahharynia committed on
Commit
2b75b49
·
verified ·
1 Parent(s): c461f62

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +33 -108
README.md CHANGED
@@ -28,116 +28,41 @@ DrugGen is a GPT-2 based model specialized for generating drug-like SMILES struc
28
  - Model Sources: liyuesen/druggpt
29
 
30
  ## How to Get Started with the Model
31
- ```python
32
- import pandas as pd
33
- from transformers import AutoTokenizer, GPT2LMHeadModel
34
- from datasets import load_dataset
35
-
36
- class SMILESGenerator:
37
- def __init__(self):
38
-
39
- # Configuration parameters
40
- self.config = {
41
- "model_name": "alimotahharynia/DrugGen",
42
- "dataset_name": "alimotahharynia/approved_drug_target",
43
- "dataset_key": "uniprot_sequence",
44
- "generation_kwargs": {
45
- "do_sample": True,
46
- "top_k": 9,
47
- "max_length": 1024,
48
- "top_p": 0.9,
49
- "num_return_sequences": 10
50
- },
51
- "max_retries": 30 # Max retry limit to avoid infinite loops
52
- }
53
-
54
- # Load model and tokenizer
55
- self.model_name = self.config["model_name"]
56
- self.model, self.tokenizer = self.load_model_and_tokenizer(self.model_name)
57
-
58
- # Load UniProt mapping dataset
59
- dataset_name = self.config["dataset_name"]
60
- dataset_key = self.config["dataset_key"]
61
- self.uniprot_to_sequence = self.load_uniprot_mapping(dataset_name, dataset_key)
62
-
63
- # Adjust generation parameters with token IDs
64
- self.generation_kwargs = self.config["generation_kwargs"]
65
- self.generation_kwargs["bos_token_id"] = self.tokenizer.bos_token_id
66
- self.generation_kwargs["eos_token_id"] = self.tokenizer.eos_token_id
67
- self.generation_kwargs["pad_token_id"] = self.tokenizer.pad_token_id
68
-
69
- def load_model_and_tokenizer(self, model_name):
70
-
71
- print(f"Loading model and tokenizer: {model_name}")
72
- tokenizer = AutoTokenizer.from_pretrained(model_name)
73
- model = GPT2LMHeadModel.from_pretrained(model_name)
74
- return model, tokenizer
75
-
76
- def load_uniprot_mapping(self, dataset_name, dataset_key):
77
-
78
- print(f"Loading dataset: {dataset_name}")
79
- try:
80
- dataset = load_dataset(dataset_name, dataset_key)
81
- return {row["UniProt_id"]: row["Sequence"] for row in dataset["uniprot_seq"]}
82
- except Exception as e:
83
- raise RuntimeError(f"Failed to load dataset {dataset_name}: {e}")
84
-
85
- def generate_smiles(self, sequence, num_generated):
86
- """
87
- Generate unique SMILES with a retry limit to avoid infinite loops.
88
- """
89
- generated_smiles_set = set()
90
- prompt = f"<|startoftext|><P>{sequence}<L>"
91
- encoded_prompt = self.tokenizer(prompt, return_tensors="pt")["input_ids"]
92
- retries = 0
93
-
94
- while len(generated_smiles_set) < num_generated:
95
- if retries >= self.config["max_retries"]:
96
- print("Max retries reached. Returning what has been generated so far.")
97
- break
98
-
99
- sample_outputs = self.model.generate(encoded_prompt, **self.generation_kwargs)
100
- for sample_output in sample_outputs:
101
- output_decode = self.tokenizer.decode(sample_output, skip_special_tokens=False)
102
- try:
103
- generated_smiles = output_decode.split("<L>")[1].split("<|endoftext|>")[0]
104
- if generated_smiles not in generated_smiles_set:
105
- generated_smiles_set.add(generated_smiles)
106
- except IndexError:
107
- continue
108
-
109
- retries += 1
110
-
111
- return list(generated_smiles_set)
112
-
113
- def generate_smiles_data(self, list_of_sequences=None, list_of_uniprot_ids=None, num_generated=10):
114
- """
115
- Generate SMILES data for sequences or UniProt IDs.
116
- """
117
- if not list_of_sequences and not list_of_uniprot_ids:
118
- raise ValueError("Either `list_of_sequences` or `list_of_uniprot_ids` must be provided.")
119
-
120
- # Prepare sequences input
121
- if list_of_sequences:
122
- sequences_input = list_of_sequences
123
- else:
124
- sequences_input = [
125
- self.uniprot_to_sequence[uid]
126
- for uid in list_of_uniprot_ids
127
- if uid in self.uniprot_to_sequence
128
- ]
129
-
130
- data = []
131
- for sequence in sequences_input:
132
- smiles = self.generate_smiles(sequence, num_generated)
133
- uniprot_id = next((uid for uid, seq in self.uniprot_to_sequence.items() if seq == sequence), None)
134
- data.append({"UniProt_id": uniprot_id, "sequence": sequence, "smiles": smiles})
135
-
136
- return pd.DataFrame(data)
137
  ```
138
- Below is an example of how to use DrugGen for generating SMILES. Adjust the `num_generated` parameter to specify the number of unique protein SMILES you wish to generate.
 
 
 
 
 
 
 
 
139
  ```python
 
 
140
  if __name__ == "__main__":
 
141
  # Initialize the generator
142
  generator = SMILESGenerator()
143
 
@@ -148,7 +73,7 @@ if __name__ == "__main__":
148
  list_of_uniprot_ids = ["P12821", "P37231"]
149
 
150
  # Generate SMILES data for sequences
151
- # df = generator.generate_smiles_data(list_of_sequences=list_of_sequences, num_generated=2)
152
 
153
  # Generate SMILES data for UniProt IDs
154
  df = generator.generate_smiles_data(list_of_uniprot_ids=list_of_uniprot_ids, num_generated=2)
 
28
  - Model Sources: liyuesen/druggpt
29
 
30
  ## How to Get Started with the Model
31
+ DrugGen can be used via a command-line interface (CLI) or integrated into Python scripts.
32
+
33
+ ### Installation
34
+ Clone the repository and navigate to its directory:
35
+ ```bash
36
+ git clone https://github.com/mahsasheikh/DrugGen.git
37
+ cd DrugGen
38
+ ```
39
+
40
+ ### Command-Line Interface
41
+ DrugGen provides a CLI to generate SMILES structures based on UniProt IDs, protein sequences, or both.
42
+
43
+ #### Generating SMILES Structures
44
+ ```bash
45
+ python3 drugGen_generator_cli.py --uniprot_ids <UniProt_IDs> --sequences <Protein_Sequences> --num_generated <Number_of_Structures> --output_file <Output_File_Name>
46
+ ```
47
+
48
+ #### Example Command
49
+ ```bash
50
+ python3 drugGen_generator_cli.py --uniprot_ids P12821 P37231 --sequences "MGAASGRRGPGLLLPLPLLLLLPPQPALALDPGLQPGNFSADEAGAQLFAQSYNSSAEQVLFQSVAASWAHDTNITAENARRQEEAALLSQEFAEAWGQKAKELYEPIWQNFTDPQLRRIIGAVRTLGSANLPLAKRQQYNALLSNMSRIYSTAKVCLPNKTATCWSLDPDLTNILASSRSYAMLLFAWEGWHNAAGIPLKPLYEDFTALSNEAYKQDGFTDTGAYWRSWYNSPTFEDDLEHLYQQLEPLYLNLHAFVRRALHRRYGDRYINLRGPIPAHLLGDMWAQSWENIYDMVVPFPDKPNLDVTSTMLQQGWNATHMFRVAEEFFTSLELSPMPPEFWEGSMLEKPADGREVVCHASAWDFYNRKDFRIKQCTRVTMDQLSTVHHEMGHIQYYLQYKDLPVSLRRGANPGFHEAIGDVLALSVSTPEHLHKIGLLDRVTNDTESDINYLLKMALEKIAFLPFGYLVDQWRWGVFSGRTPPSRYNFDWWYLRTKYQGICPPVTRNETHFDAGAKFHVPNVTPYIRYFVSFVLQFQFHEALCKEAGYEGPLHQCDIYRSTKAGAKLRKVLQAGSSRPWQEVLKDMVGLDALDAQPLLKYFQPVTQWLQEQNQQNGEVLGWPEYQWHPPLPDNYPEGIDLVTDEAEASKFVEEYDRTSQVVWNEYAEANWNYNTNITTETSKILLQKNMQIANHTLKYGTQARKFDVNQLQNTTIKRIIKKVQDLERAALPAQELEEYNKILLDMETTYSVATVCHPNGSCLQLEPDLTNVMATSRKYEDLLWAWEGWRDKAGRAILQFYPKYVELINQAARLNGYVDAGDSWRSMYETPSLEQDLERLFQELQPLYLNLHAYVRRALHRHYGAQHINLEGPIPAHLLGNMWAQTWSNIYDLVVPFPSAPSMDTTEAMLKQGWTPRRMFKEADDFFTSLGLLPVPPEFWNKSMLEKPTDGREVVCHASAWDFYNGKDFRIKQCTTVNLEDLVVAHHEMGHIQYFMQYKDLPVALREGANPGFHEAIGDVLALSVSTPKHLHSLNLLSSEGGSDEHDINFLMKMALDKIAFIPFSYLVDQWRWRVFDGSITKENYNQEWWSLRLKYQGLCPPVPRTQGDFDPGAKFHIPSSVPYIRYFVSFIIQFQFHEALCQAAGHTGPLHKCDIYQSKEAGQRLATAMKLGFSRPWPEAMQLITGQPNMSASAMLSYFKPLLDWLRTENELHGEKLGWPQYNWTPNSARSEGPLPDSGRVSFLGLDLDAQQARVGQWLLLFLGIALLVATLGLSQRLFSIRHRSLHRHSHGPQFGSEVELRHS" --num_generated 10 --output_file g_smiles_test.txt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  ```
52
+ #### Parameters
53
+ - uniprot_ids: Space-separated UniProt IDs.
54
+ - sequences: Space-separated protein sequences in string format.
55
+ - num_generated: Number of unique SMILES structures to generate.
56
+ - output_file: Name of the output file to save the generated structures.
57
+
58
+
59
+ ### Python Integration
60
+ Adjust the `num_generated` parameter to specify the number of unique SMILES structures to generate for each protein.
61
  ```python
62
+ from drugGen_generator import SMILESGenerator
63
+
64
  if __name__ == "__main__":
65
+
66
  # Initialize the generator
67
  generator = SMILESGenerator()
68
 
 
73
  list_of_uniprot_ids = ["P12821", "P37231"]
74
 
75
  # Generate SMILES data for sequences
76
+ # df = generator.generate_smiles_data(list_of_sequences=list_of_sequences, num_generated=10)
77
 
78
  # Generate SMILES data for UniProt IDs
79
  df = generator.generate_smiles_data(list_of_uniprot_ids=list_of_uniprot_ids, num_generated=2)