nsthorat-lilac committed on
Commit
77bf495
1 Parent(s): af895fd

Push to HF space

Browse files
Files changed (4)
  1. Dockerfile +4 -3
  2. README.md +1 -3
  3. data/lilac.yml +41 -569
  4. docker_start.sh +1 -1
Dockerfile CHANGED
@@ -25,14 +25,15 @@ RUN python -m pip install lilac[all]
25
  COPY --chown=user /dist ./dist/
26
  RUN python -m pip install --find-links=dist --upgrade lilac[all]
27
 
28
- COPY --chown=user .env .
29
- COPY --chown=user .env.demo .
 
30
  # Copy the README so we can read the datasets from the HuggingFace config.
31
  COPY --chown=user README.md .
32
  # Copy the license just in case.
33
  COPY --chown=user LICENSE .
34
 
35
- COPY --chown=user docker_start.sh docker_start.py ./
36
 
37
  # Make a local data directory for non-persistent storage demos.
38
  RUN mkdir -p ./data
 
25
  COPY --chown=user /dist ./dist/
26
  RUN python -m pip install --find-links=dist --upgrade lilac[all]
27
 
28
+ # Install the huggingface hub, used to download files.
29
+ RUN pip install huggingface_hub
30
+
31
  # Copy the README so we can read the datasets from the HuggingFace config.
32
  COPY --chown=user README.md .
33
  # Copy the license just in case.
34
  COPY --chown=user LICENSE .
35
 
36
+ COPY --chown=user docker_start.sh ./
37
 
38
  # Make a local data directory for non-persistent storage demos.
39
  RUN mkdir -p ./data
README.md CHANGED
@@ -6,13 +6,11 @@ colorTo: purple
6
  sdk: docker
7
  app_port: 5432
8
  datasets:
 
9
  - lilacai/lilac-piqa
10
  - lilacai/lilac-science-qa-derek-thomas
11
  - lilacai/lilac-enron-emails
12
- - lilacai/lilac-mmlu_professional_law
13
- - lilacai/lilac-pile-of-law-r-legaladvice
14
  - lilacai/lilac-open-asssistant-conversations
15
- - lilacai/lilac-squad_v2
16
  - lilacai/lilac-imdb
17
  - lilacai/lilac-opus100-en-es-validation
18
  - lilacai/lilac-databricks-dolly-15k-curated-en
 
6
  sdk: docker
7
  app_port: 5432
8
  datasets:
9
+ - lilacai/lilac-textbook_quality_programming
10
  - lilacai/lilac-piqa
11
  - lilacai/lilac-science-qa-derek-thomas
12
  - lilacai/lilac-enron-emails
 
 
13
  - lilacai/lilac-open-asssistant-conversations
 
14
  - lilacai/lilac-imdb
15
  - lilacai/lilac-opus100-en-es-validation
16
  - lilacai/lilac-databricks-dolly-15k-curated-en
data/lilac.yml CHANGED
@@ -279,277 +279,6 @@ datasets:
279
  - text
280
  markdown_paths: []
281
  preferred_embedding: gte-small
282
- - namespace: lilac
283
- name: squad_v2
284
- tags: []
285
- source:
286
- dataset_name: squad_v2
287
- config_name: null
288
- split: null
289
- sample_size: null
290
- revision: null
291
- load_from_disk: false
292
- source_name: huggingface
293
- embeddings:
294
- - path: context
295
- embedding: gte-small
296
- signals:
297
- - path: context
298
- signal:
299
- threshold: 0.85
300
- signal_name: near_dup
301
- - path: context
302
- signal:
303
- signal_name: pii
304
- - path: context
305
- signal:
306
- split_by_paragraph: false
307
- signal_name: lang_detection
308
- - path: context
309
- signal:
310
- embedding: gte-small
311
- namespace: lilac
312
- concept_name: positive-sentiment
313
- draft: main
314
- signal_name: concept_score
315
- - path: context
316
- signal:
317
- embedding: gte-small
318
- namespace: lilac
319
- concept_name: non-english
320
- draft: main
321
- signal_name: concept_score
322
- - path: context
323
- signal:
324
- embedding: gte-small
325
- namespace: lilac
326
- concept_name: toxicity
327
- draft: main
328
- signal_name: concept_score
329
- - path: context
330
- signal:
331
- embedding: gte-small
332
- namespace: lilac
333
- concept_name: question
334
- draft: main
335
- signal_name: concept_score
336
- - path: context
337
- signal:
338
- embedding: gte-small
339
- namespace: lilac
340
- concept_name: legal-termination
341
- draft: main
342
- signal_name: concept_score
343
- - path: context
344
- signal:
345
- embedding: gte-small
346
- namespace: lilac
347
- concept_name: source-code
348
- draft: main
349
- signal_name: concept_score
350
- - path: context
351
- signal:
352
- embedding: gte-small
353
- namespace: lilac
354
- concept_name: negative-sentiment
355
- draft: main
356
- signal_name: concept_score
357
- - path: context
358
- signal:
359
- embedding: gte-small
360
- namespace: lilac
361
- concept_name: profanity
362
- draft: main
363
- signal_name: concept_score
364
- - path: context
365
- signal:
366
- signal_name: text_statistics
367
- - path: question
368
- signal:
369
- threshold: 0.85
370
- signal_name: near_dup
371
- - path: question
372
- signal:
373
- signal_name: pii
374
- - path: question
375
- signal:
376
- split_by_paragraph: false
377
- signal_name: lang_detection
378
- - path: question
379
- signal:
380
- signal_name: text_statistics
381
- - path:
382
- - answers
383
- - text
384
- - '*'
385
- signal:
386
- threshold: 0.85
387
- signal_name: near_dup
388
- - path:
389
- - answers
390
- - text
391
- - '*'
392
- signal:
393
- signal_name: pii
394
- - path:
395
- - answers
396
- - text
397
- - '*'
398
- signal:
399
- split_by_paragraph: false
400
- signal_name: lang_detection
401
- - path:
402
- - answers
403
- - text
404
- - '*'
405
- signal:
406
- signal_name: text_statistics
407
- - path: question
408
- signal:
409
- embedding: gte-small
410
- namespace: lilac
411
- concept_name: legal-termination
412
- draft: main
413
- signal_name: concept_score
414
- - path: question
415
- signal:
416
- embedding: gte-small
417
- namespace: lilac
418
- concept_name: negative-sentiment
419
- draft: main
420
- signal_name: concept_score
421
- - path: question
422
- signal:
423
- embedding: gte-small
424
- namespace: lilac
425
- concept_name: non-english
426
- draft: main
427
- signal_name: concept_score
428
- - path: question
429
- signal:
430
- embedding: gte-small
431
- namespace: lilac
432
- concept_name: positive-sentiment
433
- draft: main
434
- signal_name: concept_score
435
- - path: question
436
- signal:
437
- embedding: gte-small
438
- namespace: lilac
439
- concept_name: profanity
440
- draft: main
441
- signal_name: concept_score
442
- - path: question
443
- signal:
444
- embedding: gte-small
445
- namespace: lilac
446
- concept_name: question
447
- draft: main
448
- signal_name: concept_score
449
- - path: question
450
- signal:
451
- embedding: gte-small
452
- namespace: lilac
453
- concept_name: source-code
454
- draft: main
455
- signal_name: concept_score
456
- - path: question
457
- signal:
458
- embedding: gte-small
459
- namespace: lilac
460
- concept_name: toxicity
461
- draft: main
462
- signal_name: concept_score
463
- - path:
464
- - answers
465
- - text
466
- - '*'
467
- signal:
468
- embedding: gte-small
469
- namespace: lilac
470
- concept_name: legal-termination
471
- draft: main
472
- signal_name: concept_score
473
- - path:
474
- - answers
475
- - text
476
- - '*'
477
- signal:
478
- embedding: gte-small
479
- namespace: lilac
480
- concept_name: negative-sentiment
481
- draft: main
482
- signal_name: concept_score
483
- - path:
484
- - answers
485
- - text
486
- - '*'
487
- signal:
488
- embedding: gte-small
489
- namespace: lilac
490
- concept_name: non-english
491
- draft: main
492
- signal_name: concept_score
493
- - path:
494
- - answers
495
- - text
496
- - '*'
497
- signal:
498
- embedding: gte-small
499
- namespace: lilac
500
- concept_name: positive-sentiment
501
- draft: main
502
- signal_name: concept_score
503
- - path:
504
- - answers
505
- - text
506
- - '*'
507
- signal:
508
- embedding: gte-small
509
- namespace: lilac
510
- concept_name: profanity
511
- draft: main
512
- signal_name: concept_score
513
- - path:
514
- - answers
515
- - text
516
- - '*'
517
- signal:
518
- embedding: gte-small
519
- namespace: lilac
520
- concept_name: question
521
- draft: main
522
- signal_name: concept_score
523
- - path:
524
- - answers
525
- - text
526
- - '*'
527
- signal:
528
- embedding: gte-small
529
- namespace: lilac
530
- concept_name: source-code
531
- draft: main
532
- signal_name: concept_score
533
- - path:
534
- - answers
535
- - text
536
- - '*'
537
- signal:
538
- embedding: gte-small
539
- namespace: lilac
540
- concept_name: toxicity
541
- draft: main
542
- signal_name: concept_score
543
- settings:
544
- ui:
545
- media_paths:
546
- - context
547
- - question
548
- - - answers
549
- - text
550
- - '*'
551
- markdown_paths: []
552
- preferred_embedding: gte-small
553
  - namespace: lilac
554
  name: databricks-dolly-15k-curated-en
555
  tags: []
@@ -1735,319 +1464,28 @@ datasets:
1735
  markdown_paths: []
1736
  preferred_embedding: gte-small
1737
  - namespace: lilac
1738
- name: mmlu_professional_law
1739
  tags: []
1740
  source:
1741
- dataset_name: cais/mmlu
1742
- config_name: professional_law
1743
  split: null
1744
  sample_size: null
1745
  revision: null
1746
  load_from_disk: false
1747
  source_name: huggingface
1748
  embeddings:
1749
- - path: question
1750
- embedding: gte-small
1751
- - path:
1752
- - choices
1753
- - '*'
1754
  embedding: gte-small
1755
  signals:
1756
- - path: question
1757
  signal:
1758
  threshold: 0.85
1759
  signal_name: near_dup
1760
- - path: question
1761
  signal:
1762
  signal_name: pii
1763
- - path: question
1764
- signal:
1765
- split_by_paragraph: false
1766
- signal_name: lang_detection
1767
- - path: question
1768
- signal:
1769
- embedding: gte-small
1770
- namespace: lilac
1771
- concept_name: positive-sentiment
1772
- draft: main
1773
- signal_name: concept_score
1774
- - path: question
1775
- signal:
1776
- embedding: gte-small
1777
- namespace: lilac
1778
- concept_name: non-english
1779
- draft: main
1780
- signal_name: concept_score
1781
- - path: question
1782
- signal:
1783
- embedding: gte-small
1784
- namespace: lilac
1785
- concept_name: toxicity
1786
- draft: main
1787
- signal_name: concept_score
1788
- - path: question
1789
- signal:
1790
- embedding: gte-small
1791
- namespace: lilac
1792
- concept_name: question
1793
- draft: main
1794
- signal_name: concept_score
1795
- - path: question
1796
- signal:
1797
- embedding: gte-small
1798
- namespace: lilac
1799
- concept_name: legal-termination
1800
- draft: main
1801
- signal_name: concept_score
1802
- - path: question
1803
- signal:
1804
- embedding: gte-small
1805
- namespace: lilac
1806
- concept_name: source-code
1807
- draft: main
1808
- signal_name: concept_score
1809
- - path: question
1810
- signal:
1811
- embedding: gte-small
1812
- namespace: lilac
1813
- concept_name: negative-sentiment
1814
- draft: main
1815
- signal_name: concept_score
1816
- - path: question
1817
- signal:
1818
- embedding: gte-small
1819
- namespace: lilac
1820
- concept_name: profanity
1821
- draft: main
1822
- signal_name: concept_score
1823
- - path: question
1824
- signal:
1825
- signal_name: text_statistics
1826
- - path:
1827
- - choices
1828
- - '*'
1829
- signal:
1830
- threshold: 0.85
1831
- signal_name: near_dup
1832
- - path:
1833
- - choices
1834
- - '*'
1835
- signal:
1836
- signal_name: pii
1837
- - path:
1838
- - choices
1839
- - '*'
1840
- signal:
1841
- split_by_paragraph: false
1842
- signal_name: lang_detection
1843
- - path:
1844
- - choices
1845
- - '*'
1846
- signal:
1847
- embedding: gte-small
1848
- namespace: lilac
1849
- concept_name: positive-sentiment
1850
- draft: main
1851
- signal_name: concept_score
1852
- - path:
1853
- - choices
1854
- - '*'
1855
- signal:
1856
- embedding: gte-small
1857
- namespace: lilac
1858
- concept_name: non-english
1859
- draft: main
1860
- signal_name: concept_score
1861
- - path:
1862
- - choices
1863
- - '*'
1864
- signal:
1865
- embedding: gte-small
1866
- namespace: lilac
1867
- concept_name: toxicity
1868
- draft: main
1869
- signal_name: concept_score
1870
- - path:
1871
- - choices
1872
- - '*'
1873
- signal:
1874
- embedding: gte-small
1875
- namespace: lilac
1876
- concept_name: question
1877
- draft: main
1878
- signal_name: concept_score
1879
- - path:
1880
- - choices
1881
- - '*'
1882
- signal:
1883
- embedding: gte-small
1884
- namespace: lilac
1885
- concept_name: legal-termination
1886
- draft: main
1887
- signal_name: concept_score
1888
- - path:
1889
- - choices
1890
- - '*'
1891
- signal:
1892
- embedding: gte-small
1893
- namespace: lilac
1894
- concept_name: source-code
1895
- draft: main
1896
- signal_name: concept_score
1897
- - path:
1898
- - choices
1899
- - '*'
1900
- signal:
1901
- embedding: gte-small
1902
- namespace: lilac
1903
- concept_name: negative-sentiment
1904
- draft: main
1905
- signal_name: concept_score
1906
- - path:
1907
- - choices
1908
- - '*'
1909
- signal:
1910
- embedding: gte-small
1911
- namespace: lilac
1912
- concept_name: negative-sentiment
1913
- draft: main
1914
- signal_name: concept_score
1915
- - path:
1916
- - choices
1917
- - '*'
1918
- signal:
1919
- embedding: gte-small
1920
- namespace: lilac
1921
- concept_name: profanity
1922
- draft: main
1923
- signal_name: concept_score
1924
- - path:
1925
- - choices
1926
- - '*'
1927
- signal:
1928
- signal_name: text_statistics
1929
- settings:
1930
- ui:
1931
- media_paths:
1932
- - question
1933
- - - choices
1934
- - '*'
1935
- markdown_paths: []
1936
- preferred_embedding: gte-small
1937
- - namespace: lilac
1938
- name: pile-of-law-r-legaladvice
1939
- tags: []
1940
- source:
1941
- dataset_name: pile-of-law/pile-of-law
1942
- config_name: r_legaladvice
1943
- split: null
1944
- sample_size: null
1945
- revision: null
1946
- load_from_disk: false
1947
- source_name: huggingface
1948
- embeddings:
1949
- - path: text
1950
- embedding: gte-small
1951
- signals:
1952
- - path: text
1953
- signal:
1954
- threshold: 0.85
1955
- signal_name: near_dup
1956
- - path: text
1957
- signal:
1958
- signal_name: pii
1959
- - path: text
1960
- signal:
1961
- split_by_paragraph: false
1962
- signal_name: lang_detection
1963
- - path: text
1964
- signal:
1965
- embedding: gte-small
1966
- namespace: lilac
1967
- concept_name: positive-sentiment
1968
- draft: main
1969
- signal_name: concept_score
1970
- - path: text
1971
- signal:
1972
- embedding: gte-small
1973
- namespace: lilac
1974
- concept_name: non-english
1975
- draft: main
1976
- signal_name: concept_score
1977
- - path: text
1978
- signal:
1979
- embedding: gte-small
1980
- namespace: lilac
1981
- concept_name: toxicity
1982
- draft: main
1983
- signal_name: concept_score
1984
- - path: text
1985
- signal:
1986
- embedding: gte-small
1987
- namespace: lilac
1988
- concept_name: question
1989
- draft: main
1990
- signal_name: concept_score
1991
- - path: text
1992
- signal:
1993
- embedding: gte-small
1994
- namespace: lilac
1995
- concept_name: legal-termination
1996
- draft: main
1997
- signal_name: concept_score
1998
- - path: text
1999
- signal:
2000
- embedding: gte-small
2001
- namespace: lilac
2002
- concept_name: source-code
2003
- draft: main
2004
- signal_name: concept_score
2005
- - path: text
2006
- signal:
2007
- embedding: gte-small
2008
- namespace: lilac
2009
- concept_name: negative-sentiment
2010
- draft: main
2011
- signal_name: concept_score
2012
- - path: text
2013
- signal:
2014
- embedding: gte-small
2015
- namespace: lilac
2016
- concept_name: profanity
2017
- draft: main
2018
- signal_name: concept_score
2019
- - path: text
2020
- signal:
2021
- signal_name: text_statistics
2022
- settings:
2023
- ui:
2024
- media_paths:
2025
- - text
2026
- markdown_paths: []
2027
- preferred_embedding: gte-small
2028
- - namespace: lilac
2029
- name: science-qa-derek-thomas
2030
- tags: []
2031
- source:
2032
- dataset_name: derek-thomas/ScienceQA
2033
- config_name: null
2034
- split: null
2035
- sample_size: null
2036
- revision: null
2037
- load_from_disk: false
2038
- source_name: huggingface
2039
- embeddings:
2040
- - path: lecture
2041
- embedding: gte-small
2042
- signals:
2043
- - path: lecture
2044
- signal:
2045
- threshold: 0.85
2046
- signal_name: near_dup
2047
- - path: lecture
2048
- signal:
2049
- signal_name: pii
2050
- - path: lecture
2051
  signal:
2052
  split_by_paragraph: false
2053
  signal_name: lang_detection
@@ -2297,5 +1735,39 @@ datasets:
2297
  - overview
2298
  markdown_paths: []
2299
  preferred_embedding: gte-small
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2300
  signals: []
2301
  concept_model_cache_embeddings: []
 
279
  - text
280
  markdown_paths: []
281
  preferred_embedding: gte-small
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  - namespace: lilac
283
  name: databricks-dolly-15k-curated-en
284
  tags: []
 
1464
  markdown_paths: []
1465
  preferred_embedding: gte-small
1466
  - namespace: lilac
1467
+ name: science-qa-derek-thomas
1468
  tags: []
1469
  source:
1470
+ dataset_name: derek-thomas/ScienceQA
1471
+ config_name: null
1472
  split: null
1473
  sample_size: null
1474
  revision: null
1475
  load_from_disk: false
1476
  source_name: huggingface
1477
  embeddings:
1478
+ - path: lecture
 
 
 
 
1479
  embedding: gte-small
1480
  signals:
1481
+ - path: lecture
1482
  signal:
1483
  threshold: 0.85
1484
  signal_name: near_dup
1485
+ - path: lecture
1486
  signal:
1487
  signal_name: pii
1488
+ - path: lecture
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1489
  signal:
1490
  split_by_paragraph: false
1491
  signal_name: lang_detection
 
1735
  - overview
1736
  markdown_paths: []
1737
  preferred_embedding: gte-small
1738
+ - namespace: lilac
1739
+ name: textbook_quality_programming
1740
+ tags: []
1741
+ source:
1742
+ dataset_name: vikp/textbook_quality_programming
1743
+ config_name: null
1744
+ split: null
1745
+ sample_size: null
1746
+ revision: null
1747
+ load_from_disk: false
1748
+ source_name: huggingface
1749
+ embeddings:
1750
+ - path:
1751
+ - outline
1752
+ - '*'
1753
+ embedding: gte-small
1754
+ - path:
1755
+ - concepts
1756
+ - '*'
1757
+ embedding: gte-small
1758
+ - path: markdown
1759
+ embedding: gte-small
1760
+ signals: []
1761
+ settings:
1762
+ ui:
1763
+ media_paths:
1764
+ - - outline
1765
+ - '*'
1766
+ - - concepts
1767
+ - '*'
1768
+ - markdown
1769
+ markdown_paths:
1770
+ - markdown
1771
+ preferred_embedding: gte-small
1772
  signals: []
1773
  concept_model_cache_embeddings: []
docker_start.sh CHANGED
@@ -3,7 +3,7 @@
3
  # Fail if any of the commands below fail.
4
  set -e
5
 
6
- python docker_start.py
7
  gunicorn lilac.server:app \
8
  --bind 0.0.0.0:5432 \
9
  --preload -k uvicorn.workers.UvicornWorker \
 
3
  # Fail if any of the commands below fail.
4
  set -e
5
 
6
+ lilac hf-docker-start
7
  gunicorn lilac.server:app \
8
  --bind 0.0.0.0:5432 \
9
  --preload -k uvicorn.workers.UvicornWorker \