msdkhairi committed
Commit bf9d0ba
1 Parent(s): da1d27e

Initial Commit

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.lst filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,102 @@
+ import torch
+ import torchvision.transforms as transforms
+
+ import matplotlib
+ import matplotlib.pyplot as plt
+
+ import gradio as gr
+
+ from math2latex.data import Tokenizer
+ from math2latex.model import ResNetTransformer
+
+ # Global variables to hold the setup components
+ model, tokenizer = None, None
+
+ def get_formulas(filename):
+     with open(filename, 'r') as f:
+         formulas = f.readlines()
+     return formulas
+
+ def latex2image(latex_expression, image_name, image_size_in=(3, 0.6), fontsize=12, dpi=200):
+
+     # Runtime Configuration Parameters
+     matplotlib.rcParams["mathtext.fontset"] = "cm"  # Font changed to Computer Modern
+     # matplotlib.rcParams['text.usetex'] = True  # Use LaTeX to write all text
+
+     fig = plt.figure(figsize=image_size_in, dpi=dpi)
+     fig.text(
+         x=0.5,
+         y=0.5,
+         s=latex_expression,
+         horizontalalignment="center",
+         verticalalignment="center",
+         fontsize=fontsize,
+     )
+
+     plt.savefig(image_name)
+     plt.close(fig)
+
+ def setup():
+     global model, tokenizer
+     # setup the model
+     checkpoint_path = 'checkpoints/model.ckpt'
+     model = ResNetTransformer()
+     state_dict = torch.load(checkpoint_path, map_location='cpu')['state_dict']
+     state_dict = {k.replace('model.', ''): v for k, v in state_dict.items()}
+     model.load_state_dict(state_dict)
+     model.to("cpu")
+     model.eval()
+
+     # setup the tokenizer
+     formulas = get_formulas('dataset/im2latex_formulas.norm.processed.lst')
+     tokenizer = Tokenizer(formulas)
+
+
+ def predict_image(image):
+     global model, tokenizer
+
+     if model is None or tokenizer is None:
+         setup()
+
+     transform = transforms.ToTensor()
+
+     image = transform(image)
+     image = image.unsqueeze(0)
+     with torch.no_grad():
+         output = model.predict(image)
+
+     latex_code = tokenizer.decode_to_string(output[0].tolist())  # drop special tokens and join into a string
+     return latex_code
+
+ def predict_and_convert_to_image(image):
+
+     latex_code = predict_image(image)
+
+     image_name = 'temp.png'
+     latex_code_modified = latex_code.replace(" ", "")  # Remove spaces from the LaTeX code
+     latex_code_modified = rf"""${latex_code_modified}$"""
+     latex2image(latex_code_modified, image_name)
+
+     # Return both the LaTeX code and the path of the generated image
+     return latex_code, image_name
+
+ def main():
+     setup()
+     examples = [
+         ["dataset/formula_images_processed/78228211ca.png"],
+         ["dataset/formula_images_processed/2b891b21ac.png"],
+         ["dataset/formula_images_processed/a8ec0c091c.png"],
+     ]
+     demo = gr.Interface(
+         fn=predict_and_convert_to_image,
+         inputs='image',
+         outputs=['text', 'image'],
+         # examples=examples,
+         title='Image to LaTeX code',
+         description='Convert an image of a mathematical formula to LaTeX code and view the result as an image. Upload an image of a formula to get both the LaTeX code and the corresponding image, or use the examples provided.'
+     )
+     demo.launch()
+
+ if __name__ == "__main__":
+     main()
+
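A quick way to sanity-check the committed app outside the Gradio UI is a short driver script along the following lines. This is a sketch, not part of the commit: it assumes the checkpoint, the formulas .lst file, and the example images added above are materialized on disk (they are Git LFS objects), that the rest of the math2latex package imports cleanly, and that Pillow is available (it ships as a gradio dependency).

from PIL import Image

from app import predict_and_convert_to_image

# A single-channel input is fine: the model repeats grayscale images to 3 channels internally.
formula_image = Image.open("dataset/formula_images_processed/78228211ca.png").convert("L")

latex_code, rendered_path = predict_and_convert_to_image(formula_image)
print(latex_code)      # predicted LaTeX source
print(rendered_path)   # 'temp.png', the matplotlib rendering of the prediction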
checkpoints/model.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5a21e6e4249e1f7ade1623a66ec17446ab7f870c10accf378057e3c3a9ed2a4
+ size 42532502
dataset/formula_images_processed/2b891b21ac.png ADDED
dataset/formula_images_processed/78228211ca.png ADDED
dataset/formula_images_processed/a8ec0c091c.png ADDED
dataset/im2latex_formulas.norm.processed.lst ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d90bc9a6fc69cfcc7c55690eb3f3e8fe8e821236888270db6a7265aea78ade1
+ size 17711869
math2latex/data/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .prepare_data import prepare_data
+ from .utils import get_formulas
+ from .dataset import MathToLatexDataset, get_dataloader
+ from .tokenizer import Tokenizer
math2latex/data/tokenizer.py ADDED
@@ -0,0 +1,81 @@
+ import json
+
+ from collections import Counter
+ from torchtext.data.utils import get_tokenizer
+ from torchtext.vocab import vocab
+
+
+ class Tokenizer:
+     def __init__(self, formulas=None, max_len=150):
+         # self.tokenizer = get_tokenizer(None)
+         self.tokenizer = get_tokenizer("basic_english")
+         self.max_len = max_len
+
+         if formulas is not None:
+             self.vocab = self._build_vocab(formulas)
+             self.vocab.set_default_index(self.vocab['<unk>'])
+             self.pad_index = self.vocab['<pad>']
+             self.ignore_indices = {self.vocab['<pad>'], self.vocab['<bos>'], self.vocab['<eos>'], self.vocab['<unk>']}
+         else:
+             self.vocab = None
+
+     def _build_vocab(self, formulas):
+         counter = Counter()
+         for formula in formulas:
+             counter.update(self.tokenizer(formula))
+         return vocab(counter, specials=['<pad>', '<bos>', '<eos>', '<unk>'], min_freq=2)
+
+     def encode(self, formula, with_padding=False):
+         tokens = self.tokenizer(formula)
+         tokens = ['<bos>'] + tokens + ['<eos>']  # add <bos> and <eos> to the beginning and end of the tokens
+         if with_padding:
+             tokens = self.pad(tokens, self.max_len)
+         # map tokens to vocabulary indices
+         return [self.vocab[token] for token in tokens]
+
+     def decode(self, indices):
+         return self.vocab.lookup_tokens(list(indices))
+
+     def decode_clean(self, indices):
+         # removes the ignore indices from the decoded tokens
+         cleaned_indices = [index for index in indices if int(index) not in self.ignore_indices]
+         # if self.vocab['<eos>'] in cleaned_indices:
+         #     cleaned_indices = cleaned_indices[:cleaned_indices.index(self.vocab['<eos>'])]
+         return self.vocab.lookup_tokens(cleaned_indices)
+
+     def decode_to_string(self, tokens):
+         # returns the decoded tokens as a string
+         decoded = self.decode_clean(tokens)
+         return ' '.join(decoded)
+
+
+     def pad(self, tokens, max_len):
+         if len(tokens) > max_len:
+             tokens = tokens[:max_len]
+             tokens[-1] = '<eos>'
+             return tokens
+         return tokens + ['<pad>'] * (max_len - len(tokens))
+
+     def save_vocab(self, file_path="dataset/tokenizer_vocab.json"):
+         # Save the list of tokens which reflects both `itos` and `stoi`
+         vocab_data = {
+             'itos': self.vocab.get_itos()
+         }
+         with open(file_path, 'w') as f:
+             json.dump(vocab_data, f)
+
+     def load_vocab(self, file_path):
+         with open(file_path, 'r') as f:
+             vocab_data = json.load(f)
+         # Reconstruct the vocabulary from the itos list
+         ordered_tokens = vocab_data['itos']
+         # Reconstruct the counter from the ordered list
+         counter = Counter({token: idx + 1 for idx, token in enumerate(ordered_tokens)})  # idx+1 to ensure non-zero freq
+         self.vocab = vocab(counter, specials=['<pad>', '<bos>', '<eos>', '<unk>'])
+         self.vocab.set_default_index(self.vocab['<unk>'])
+         self.pad_index = self.vocab['<pad>']
+         self.ignore_indices = {self.vocab['<pad>'], self.vocab['<bos>'], self.vocab['<eos>'], self.vocab['<unk>']}
+
+
+     def __len__(self):
+         return len(self.vocab)
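The tokenizer can be exercised on its own with a couple of toy formulas. A minimal sketch, assuming the sibling modules imported by math2latex/data/__init__.py (prepare_data, utils, dataset) are also present in the working tree; note that _build_vocab uses min_freq=2, so a token must occur at least twice to get its own id, which is why the toy formula is repeated below.

from math2latex.data import Tokenizer

formulas = [r"\frac { a } { b }", r"\frac { a } { b }"]  # repeated so every token clears min_freq=2
tokenizer = Tokenizer(formulas, max_len=20)

ids = tokenizer.encode(formulas[0], with_padding=True)   # <bos> ... <eos>, padded out to max_len
print(len(ids))                          # 20
print(tokenizer.decode(ids))             # raw tokens, including the special symbols
print(tokenizer.decode_to_string(ids))   # special tokens stripped: '\frac { a } { b }'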
math2latex/model/__init__.py ADDED
@@ -0,0 +1 @@
+ from .transformer import ResNetTransformer
math2latex/model/positional_encoding.py ADDED
@@ -0,0 +1,45 @@
+ import torch
+ import torch.nn as nn
+
+ class PositionalEncoding1D(nn.Module):
+     def __init__(self, d_model, max_len=1000, dropout=0.1):
+         super().__init__()
+         self.d_model = d_model
+         self.max_len = max_len
+         self.dropout = nn.Dropout(p=dropout)
+
+         self.encoding = torch.zeros(max_len, d_model)
+         position = torch.arange(0, max_len).unsqueeze(1).float()
+         div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(torch.log(torch.tensor(10000.0)) / d_model))
+         self.encoding[:, 0::2] = torch.sin(position * div_term)
+         self.encoding[:, 1::2] = torch.cos(position * div_term)
+         self.encoding = self.encoding.unsqueeze(1)
+
+     def forward(self, x):
+         self.encoding = self.encoding.to(x.device)
+         x = x + self.encoding[:x.size(0)].detach()
+         return self.dropout(x)
+
+
+ class PositionalEncoding2D(nn.Module):
+     def __init__(self, d_model, max_h=1000, max_w=1000, dropout=0.1):
+         super().__init__()
+         self.d_model = d_model
+         self.max_h = max_h
+         self.max_w = max_w
+         self.dropout = nn.Dropout(p=dropout)
+
+         # create self.encoding considering input x as the shape (B, d_model, H, W)
+         self.encoding = torch.zeros(max_h, max_w, d_model)
+         position_h = torch.arange(0, max_h).unsqueeze(1).float()
+         position_w = torch.arange(0, max_w).unsqueeze(1).float()
+         div_term_h = torch.exp(torch.arange(0, d_model, 2).float() * -(torch.log(torch.tensor(10000.0)) / d_model))
+         div_term_w = torch.exp(torch.arange(1, d_model, 2).float() * -(torch.log(torch.tensor(10000.0)) / d_model))
+         self.encoding[:, :, 0::2] = torch.sin(position_h * div_term_h).unsqueeze(1)
+         self.encoding[:, :, 1::2] = torch.cos(position_w * div_term_w).unsqueeze(0)
+         self.encoding = self.encoding.permute(2, 0, 1)
+
+     def forward(self, x):
+         self.encoding = self.encoding.to(x.device)
+         x = x + self.encoding[:, :x.size(2), :x.size(3)].detach()
+         return self.dropout(x)
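A small shape check makes the two input conventions explicit: the 1D module expects (sequence length, batch, d_model) while the 2D module expects (batch, d_model, H, W). A sketch with illustrative sizes (the small max_h/max_w just keep the precomputed table tiny; the model itself uses larger defaults):

import torch

from math2latex.model.positional_encoding import PositionalEncoding1D, PositionalEncoding2D

pe1d = PositionalEncoding1D(d_model=128, max_len=150)
tokens = torch.zeros(10, 4, 128)           # (sequence length, batch, d_model)
print(pe1d(tokens).shape)                  # torch.Size([10, 4, 128])

pe2d = PositionalEncoding2D(d_model=128, max_h=64, max_w=256)
feature_map = torch.zeros(4, 128, 8, 32)   # (batch, d_model, H, W)
print(pe2d(feature_map).shape)             # torch.Size([4, 128, 8, 32])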
math2latex/model/transformer.py ADDED
@@ -0,0 +1,120 @@
+ import torch
+ import torch.nn as nn
+
+ import torchvision
+
+ from .positional_encoding import PositionalEncoding1D, PositionalEncoding2D
+
+ class ResNetTransformer(nn.Module):
+     def __init__(self,
+                  d_model=128,
+                  num_heads=4,
+                  num_decoder_layers=3,
+                  dim_feedforward=256,
+                  dropout=0.1,
+                  pos_enc_dropout=0.1,
+                  activation='relu',
+                  max_len_output=150,
+                  num_classes=462):
+         super().__init__()
+         self.d_model = d_model
+         self.num_heads = num_heads
+         self.num_decoder_layers = num_decoder_layers
+         self.dim_feedforward = dim_feedforward
+         self.dropout = dropout
+         self.activation = activation
+         self.num_classes = num_classes
+         self.max_len_output = max_len_output
+
+         # Encoder
+         resnet18 = torchvision.models.resnet18(weights=None)
+
+         # Remove the classification head and layer 4 from resnet18 and keep the first 3 layers
+         self.backbone = nn.Sequential(*list(resnet18.children())[:-3])
+
+         self.conv1x1 = nn.Conv2d(256, d_model, kernel_size=1, stride=1, padding=0)
+         self.encoder_pos_enc = PositionalEncoding2D(d_model,
+                                                     max_h=1000,
+                                                     max_w=1000,
+                                                     dropout=pos_enc_dropout)  # no images are larger than 1000x1000
+
+         # Decoder
+         self.embedding = nn.Embedding(num_classes, d_model)
+         self.decoder_pos_enc = PositionalEncoding1D(d_model,
+                                                     max_len=max_len_output,
+                                                     dropout=pos_enc_dropout)
+         _transformer_decoder_layer = nn.TransformerDecoderLayer(d_model=d_model,
+                                                                 nhead=num_heads,
+                                                                 dim_feedforward=dim_feedforward,
+                                                                 dropout=dropout,
+                                                                 activation=activation)
+         self.transformer_decoder = nn.TransformerDecoder(decoder_layer=_transformer_decoder_layer,
+                                                          num_layers=num_decoder_layers)
+
+         self.linear = nn.Linear(d_model, num_classes)
+         # self.softmax = nn.Softmax(dim=2)
+
+         # get target mask for training
+         self.tgt_mask = self.get_tgt_mask(max_len_output)
+         if self.training:
+             self._init_weights()
+
+
+     def _init_weights(self):
+         self.embedding.weight.data.uniform_(-0.1, 0.1)
+         self.linear.weight.data.uniform_(-0.1, 0.1)
+         self.linear.bias.data.zero_()
+
+         nn.init.kaiming_normal_(self.conv1x1.weight, mode='fan_out', nonlinearity='relu')
+         if self.conv1x1.bias is not None:
+             _, fan_out = nn.init._calculate_fan_in_and_fan_out(self.conv1x1.weight)
+             bound = 1 / torch.sqrt(torch.tensor(fan_out))
+             nn.init.normal_(self.conv1x1.bias, -bound, bound)
+
+     def get_tgt_mask(self, target_size):
+         tgt_mask = torch.triu(torch.ones(target_size, target_size), diagonal=1)
+         tgt_mask = tgt_mask.masked_fill(tgt_mask == 1, float('-inf'))
+         return tgt_mask
+
+     def encode(self, x):
+         # Repeat the input x if it has only 1 channel
+         if x.shape[1] == 1:
+             x = x.repeat(1, 3, 1, 1)
+         x = self.backbone(x)
+         x = self.conv1x1(x)
+         x = self.encoder_pos_enc(x)
+         x = x.flatten(2)
+         x = x.permute(2, 0, 1)
+         return x
+
+     def decode(self, tgt, x):
+         tgt = tgt.permute(1, 0)
+         tgt = self.embedding(tgt)
+         tgt = self.decoder_pos_enc(tgt)
+         tgt_mask = self.tgt_mask[:tgt.size(0), :tgt.size(0)]
+         output = self.transformer_decoder(tgt, x, tgt_mask)
+         output = self.linear(output)
+         return output
+
+     def forward(self, x, tgt):
+         # Encoder
+         x = self.encode(x)
+
+         # Decoder
+         output = self.decode(tgt, x)
+         return output.permute(1, 2, 0)
+
+
+     def predict(self, x):
+         b = x.size(0)
+         x = self.encode(x)
+
+         tgt = torch.zeros((b, self.max_len_output), dtype=torch.long).to(x.device)
+         tgt[:, 0] = 1
+         for t in range(1, self.max_len_output):
+             output = self.decode(tgt[:, :t], x)
+             output = output.argmax(dim=-1)
+             tgt[:, t] = output[-1]  # last predicted token for each sequence in the batch
+
+         return tgt
+
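As a shape sanity check of the model wiring (the ResNet-18 backbone through layer3 downsamples by 16x, so a 64x256 crop becomes a 4x16 feature map, i.e. 64 memory positions for the decoder), the following sketch with random tensors is not part of the repo and uses illustrative sizes only:

import torch

from math2latex.model import ResNetTransformer

model = ResNetTransformer()
model.eval()

images = torch.rand(2, 1, 64, 256)          # grayscale crops; encode() repeats them to 3 channels
targets = torch.randint(0, 462, (2, 150))   # (batch, max_len_output) token indices

with torch.no_grad():
    logits = model(images, targets)         # forward() returns (batch, num_classes, sequence length)
    print(logits.shape)                     # torch.Size([2, 462, 150])

    tokens = model.predict(images)          # greedy decoding, starting from token id 1 (<bos>)
    print(tokens.shape)                     # torch.Size([2, 150])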
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ # tested with python 3.11.10
+ --extra-index-url https://download.pytorch.org/whl/cpu
+ torch==2.3.1
+ torchvision==0.18.1
+ torchaudio==2.3.1
+ cuda-python==12.1.0
+ lightning==2.3.3
+ torchmetrics==1.4.2
+ tensorboard==2.18.0
+ matplotlib==3.9.2
+ nltk==3.9.1
+ torchtext==0.18.0
+ gradio==4.44.1