#! /usr/bin/python3
import os,json
tgt="KoichiYasuoka/modernbert-large-english-ud-triangular"
url="https://github.com/UniversalDependencies/UD_English-"
for e in ["EWT","GUM","Atis","ParTUT","LinES"]:
  u=url+e
  d=os.path.basename(u)
  os.system("test -d "+d+" || git clone --depth=1 "+u)
# copy SpaceAfter=No down from each multiword-token range line onto the word
# lines inside it (and onto the last word too when the range itself says so)
s='BEGIN{FS="\\t";OFS="\\t"};{if(NF==10){if($1~/^[1-9][0-9]*-/){split($1,a,"-");if($10~/SpaceAfter=No/)a[2]++}else if($1-a[1]>=0&&$1-a[2]<0)$10=($10=="_")?"SpaceAfter=No":$10"|SpaceAfter=No"}print}'
os.system("for F in train dev test ; do nawk '"+s+"' UD_English-*/*-$F.conllu > $F.conllu ; done")
# fetch a local ModernBERT-large and give it standalone configuration/modeling
# files whose relative imports are rewritten to absolute transformers imports
os.system("""
if test -d transformers
then :
else git clone --depth=1 https://github.com/huggingface/transformers transformers-all
     ln -s transformers-all/src/transformers transformers
fi
test -d ModernBERT-large || git clone --depth=1 https://huggingface.co/answerdotai/ModernBERT-large
test -f ModernBERT-large/configuration_modernbert.py || sed 's/^from \\.\\.\\./from transformers./' transformers/models/modernbert/configuration_modernbert.py > ModernBERT-large/configuration_modernbert.py
test -f ModernBERT-large/modeling_modernbert.py || sed -e 's/^from \\.\\.\\./from transformers./' -e 's/^from .* import is_triton_available/import importlib\\nis_triton_available = lambda: importlib.util.find_spec("triton") is not None/' transformers/models/modernbert/modeling_modernbert.py > ModernBERT-large/modeling_modernbert.py
""")
# register the local modeling files via auto_map so the Auto* classes can
# load them with trust_remote_code
with open("ModernBERT-large/config.json","r",encoding="utf-8") as r:
  d=json.load(r)
if not "auto_map" in d:
  d["auto_map"]={
    "AutoConfig":"configuration_modernbert.ModernBertConfig",
    "AutoModel":"modeling_modernbert.ModernBertModel",
    "AutoModelForMaskedLM":"modeling_modernbert.ModernBertForMaskedLM",
    "AutoModelForSequenceClassification":"modeling_modernbert.ModernBertForSequenceClassification",
    "AutoModelForTokenClassification":"modeling_modernbert.ModernBertForTokenClassification"
  }
  with open("ModernBERT-large/config.json","w",encoding="utf-8") as w:
    json.dump(d,w,indent=2)

class UDTriangularDataset(object):
  def __init__(self,conllu,tokenizer):
    self.conllu=open(conllu,"r",encoding="utf-8")
    self.tokenizer=tokenizer
    self.seeks=[0]
    label=set(["SYM|x","X|x"])
    dep=set(["X|x|r-goeswith"])
    s=self.conllu.readline()
    while s!="":
      if s=="\n":
        self.seeks.append(self.conllu.tell())
      else:
        w=s.split("\t")
        if len(w)==10:
          if w[0].isdecimal():
            p=w[3]
            q="" if w[5]=="_" else "|"+w[5]
            d=("|" if w[6]=="0" else "|l-" if int(w[0])<int(w[6]) else "|r-")+w[7]
            # NOTE: the source text was lost from here down to the middle of
            # __getitem__; everything up to the marked line below is a
            # best-guess reconstruction from the surviving fragments and may
            # differ from the original
            for k in ["b","x"]:
              label.add(p+"|"+k+q)
              dep.add(p+"|"+k+q+d)
      s=self.conllu.readline()
    lid={l:i for i,l in enumerate(sorted(label))}
    for i,d in enumerate(sorted(dep),len(lid)):
      lid[d]=i
    self.label2id=lid
  def __call__(*args):
    # merge label2id across several dataset splits into one shared mapping
    lid={l:i for i,l in enumerate(sorted(set(sum([list(t.label2id) for t in args],[]))))}
    for t in args:
      t.label2id=lid
    return lid
  def __del__(self):
    self.conllu.close()
  def __len__(self):
    return len(self.seeks)-1
  def __getitem__(self,i):
    self.conllu.seek(self.seeks[i])
    c=[]
    while True:
      w=self.conllu.readline().split("\t")
      if len(w)!=10:
        break
      if w[0].isdecimal():
        c.append(w)
    h=[int(t[6]) for t in c]
    # the surviving source text resumes inside the next line ("b" marks a
    # token whose head or dependent still lies to its right, "x" the rest)
    x=["b" if k>i or sum([1 if j==i+1 else 0 for j in h[i+1:]])>0 else "x" for i,k in enumerate(h)]
    p=[t[3]+"|"+x[i] if t[5]=="_" else t[3]+"|"+x[i]+"|"+t[5] for i,t in enumerate(c)]
    d=[t[7] if t[6]=="0" else "l-"+t[7] if int(t[0])<int(t[6]) else "r-"+t[7] for t in c]
    # (the source breaks off here: the construction of input_ids and labels
    #  from c, x, p, d is missing, as is the training code that uses tgt)
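
# ---------------------------------------------------------------------------
# Hedged sketch, NOT the author's code: the script above is truncated before
# the training step, so this is only a typical Trainer-based tail for a maker
# script of this shape.  It assumes the lost part of __getitem__ returns
# {"input_ids":...,"labels":...} dicts and that __call__ merges label2id
# across splits as reconstructed above.  All names below (tkz, trainDS, lid,
# cfg, arg, trn) and the hyperparameters are illustrative assumptions.
# ---------------------------------------------------------------------------
from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DataCollatorForTokenClassification,TrainingArguments,Trainer
tkz=AutoTokenizer.from_pretrained("ModernBERT-large")
trainDS=UDTriangularDataset("train.conllu",tkz)
devDS=UDTriangularDataset("dev.conllu",tkz)
testDS=UDTriangularDataset("test.conllu",tkz)
lid=trainDS(devDS,testDS)  # one shared label2id over train/dev/test
cfg=AutoConfig.from_pretrained("ModernBERT-large",num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},trust_remote_code=True)
arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=8,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2)
trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),model=AutoModelForTokenClassification.from_pretrained("ModernBERT-large",config=cfg,trust_remote_code=True),train_dataset=trainDS)
trn.train()
trn.save_model(tgt)
tkz.save_pretrained(tgt)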