aravindhank commited on
Commit
e32af19
·
verified ·
1 Parent(s): 3c15818

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +97 -0
README.md ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # WordLLama - Indic
2
+
3
+ Inspired by WordLlama, trained using word embeddings of the Sarvam-1 model, which supports most
4
+ Indic languages. We used a translated subset of https://huggingface.co/datasets/sentence-transformers/all-nli
5
+ to train this model.
6
+
7
+ Weights and tokenizer are derived from sarvam-1. For license terms, refer to https://huggingface.co/sarvamai/sarvam-1.
8
+
9
+
10
+ ## How to use
11
+
12
+ Install the fork of WordLlama:
13
+ `pip install "wordllama @ git+https://github.com/tinisoft/WordLlama.git"`
14
+
15
+ Download the weights and tokenizer:
16
+ `git clone https://huggingface.co/tinisoft/wordllama-indic && cd wordllama-indic`
17
+
18
+ Code can be used like this:
19
+ ```python
20
+ from wordllama import WordLlamaInference, WordLlamaConfig, WordLlama
21
+ from safetensors import safe_open
22
+ import toml
23
+ from tokenizers import Tokenizer
24
+
25
+ tokenizer = Tokenizer.from_file("tokenizer.json")
26
+ f = safe_open("sarvam1_2b_128.safetensors", framework="pt", device="cpu")
27
+ embedding = f.get_tensor('embedding.weight').numpy()
28
+
29
+ config_file = "sarvam1_2b.toml"
30
+ config_data = toml.load(config_file)
31
+ config_data["config_name"] = "sarvam1_2b"
32
+ config = WordLlamaConfig(**config_data)
33
+
34
+ wl = WordLlamaInference(
35
+ embedding=embedding,
36
+ tokenizer=tokenizer,
37
+ config=config,
38
+ binary=False,
39
+ )
40
+
41
+ # Calculate similarity between two sentences
42
+ similarity_score = wl.similarity("I went to the car", "I went to the pawn shop")
43
+ print(similarity_score) # Output: e.g., 0.0664
44
+
45
+ # Rank documents based on their similarity to a query
46
+ query = "I went to the car"
47
+ candidates = ["I went to the park", "I went to the shop", "I went to the truck", "I went to the vehicle"]
48
+ ranked_docs = wl.rank(query, candidates)
49
+ print(ranked_docs)
50
+
51
+
52
+ # Calculate similarity between two sentences in Tamil
53
+ similarity_score = wl.similarity("நான் கார் சென்றேன்", "நான் கடைக்கு சென்றேன்")
54
+ print(similarity_score) # Output: e.g., 0.075
55
+
56
+ # Rank documents based on their similarity to a Tamil query
57
+ query = "நான் கார் சென்றேன்"
58
+ candidates = [
59
+ "நான் பூங்காவிற்கு சென்றேன்",
60
+ "நான் கடைக்கு சென்றேன்",
61
+ "நான் லாரி சென்றேன்",
62
+ "நான் வாகனத்தில் சென்றேன்"
63
+ ]
64
+ ranked_docs = wl.rank(query, candidates)
65
+ print(ranked_docs)
66
+
67
+ query = "నేను కారులో వెళ్లాను"
68
+ candidates = [
69
+ "నేను పార్క్‌కి వెళ్లాను",
70
+ "నేను మార్కెట్‌కి వెళ్లాను",
71
+ "నేను లారీలో వెళ్లాను",
72
+ "నేను వాహనంలో వెళ్లాను"
73
+ ]
74
+ ranked_docs = wl.rank(query, candidates)
75
+ print(ranked_docs)
76
+ ```
77
+
78
+
79
+ ## Run code like this
80
+
81
+ ---
82
+ language:
83
+ - en
84
+ - ta
85
+ - ml
86
+ - as
87
+ - bn
88
+ - gu
89
+ - hi
90
+ - kn
91
+ - mr
92
+ - or
93
+ - te
94
+ base_model:
95
+ - sarvamai/sarvam-1
96
+ pipeline_tag: sentence-similarity
97
+ ---