minskiter committed on
Commit
1e4d53d
1 Parent(s): a70d44f

feat(app.py): update app.py

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ **/__pycache__
app.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# app.py — Gradio demo entry point for the resume NER service.

from predictor import Predictor
from transformers import pipeline
from huggingface_hub import login
from datetime import date
import os
import gradio as gr

# Authenticate against the Hugging Face Hub; the token must be supplied via
# the HF_Token environment variable (raises KeyError if it is missing).
login(os.environ["HF_Token"])
# Local school-registry spreadsheets consumed by Predictor (school -> level).
paths = [
    "data/W020230619818476939351.xls",
    "data/W020230619818476975218.xls"
]
# Two token-classification pipelines backing the predictor:
#   "name"   — validates person-name candidates
#   "common" — general resume entities (ORG / EDU / TITLE)
# NOTE(review): "nerpipe" is a custom task and trust_remote_code=True executes
# code shipped in the model repositories — confirm the repos are trusted.
predictor = Predictor(
    pipelines={
        "name": pipeline("nerpipe", model="minskiter/resume-token-classification-name-0708",trust_remote_code=True,use_auth_token=True),
        "common": pipeline("nerpipe",model="minskiter/resume-token-classification",trust_remote_code=True,use_auth_token=True)
    },
    paths=paths,
    # Fixed reference date: anchors "至今" (present) and age computations.
    today=date(2023,4,1)
)
21
+
def ner_predictor_gradio(input):
    """Run the resume predictor and flatten its grouped entities for display.

    Returns the {"text": ..., "entities": [...]} mapping expected by
    gr.HighlightedText.
    """
    result = predictor(input)
    flat_entities = []
    # Scalar fields (e.g. work_time) are skipped; list fields are flattened
    # one level, since job entries are themselves lists of entity dicts.
    for value in result.values():
        if not isinstance(value, list):
            continue
        for item in value:
            if isinstance(item, list):
                flat_entities.extend(item)
            else:
                flat_entities.append(item)
    return {"text": input, "entities": flat_entities}
35
+
# Build and serve the Gradio UI: one textbox in, highlighted entities out.
resume_box = gr.Textbox(lines=5, label="输入你的简历")
highlighted = gr.HighlightedText(label="简历识别结果")
demo = gr.Interface(
    fn=ner_predictor_gradio,
    inputs=resume_box,
    outputs=highlighted,
)
demo.launch()
data/W020230619818476939351.xls ADDED
Binary file (429 kB). View file
 
data/W020230619818476975218.xls ADDED
Binary file (56.3 kB). View file
 
predictor/__init__.py ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from datetime import date
from io import BytesIO
from typing import Any, Dict, List, Optional
import math
import queue
import re

import pandas as pd
import requests
from transformers import Pipeline
class Predictor():
    """Rule + model based resume information extractor.

    Combines regular expressions (names, phones, e-mails, dates, schools)
    with two token-classification pipelines:

    * ``pipelines["name"]``   -- validates person-name candidates (NAME)
    * ``pipelines["common"]`` -- general entities (ORG / EDU / TITLE)

    ``paths`` point to Excel sheets listing known schools (column 1 = name,
    column 5 = degree level); ``today`` anchors "至今" (present) and age
    computations so results are reproducible.
    """

    def __init__(
        self,
        pipelines: Optional[Dict[str, "Pipeline"]] = None,
        paths: Optional[List[str]] = None,
        today: Optional[date] = None
    ) -> None:
        # fix: None defaults instead of mutable {} / [] literals and an
        # import-time date.today() call — the old defaults were shared across
        # instances and froze "today" at class-definition time.
        pipelines = {} if pipelines is None else pipelines
        paths = [] if paths is None else paths
        if "name" not in pipelines:
            raise ValueError("'name' pipeline is None")
        if "common" not in pipelines:
            raise ValueError("'common' pipeline is None")
        self.pipelines = pipelines
        self.today = date.today() if today is None else today
        self.__init_split_data()
        self.__init_schools_data(paths)
        self.__init_patterns()

    def __init_patterns(
        self
    ):
        """Compile every regular expression used by :meth:`process`.

        The patterns tolerate a few interleaved spaces ("1 3 9…") because
        text extracted from PDF resumes often carries spurious spacing.
        """
        # Chinese family names (single and compound); a match anchors a
        # person-name candidate which the "name" pipeline then verifies.
        last_name = r"[赵,钱,孙,李,周,吴,郑,王,冯,陈,楮,卫,蒋,沈,韩,杨,朱,秦,尤,许,何,吕,施,张,孔,曹,严,华,金,魏,陶,姜,戚,谢,邹,喻,"\
            +r"柏,水,窦,章,云,苏,潘,葛,奚,范,彭,郎,鲁,韦,昌,马,苗,凤,花,方,俞,任,袁,柳,酆,鲍,史,唐,费,廉,岑,薛,雷,贺,倪,汤,滕,殷,罗," \
            + r"毕,郝,邬,安,常,乐,于,时,傅,皮,卞,齐,康,伍,余,元,卜,顾,孟,平,黄,和,穆,萧,尹,姚,邵,湛,汪,祁,毛,禹,狄,米,贝,明,臧,计,伏,成,戴,谈,宋,茅," \
            + r"庞,熊,纪,舒,屈,项,祝,董,梁,杜,阮,蓝,闽,席,季,麻,强,贾,路,娄,危,江,童,颜,郭,梅,盛,林,刁,锺,徐,丘,骆,高,夏,蔡,田,樊,胡,凌,霍,虞,万,支," \
            + r"柯,昝,管,卢,莫,经,房,裘,缪,干,解,应,宗,丁,宣,贲,邓,郁,单,杭,洪,包,诸,左,石,崔,吉,钮,龚,程,嵇,邢,滑,裴,陆,荣,翁,荀,羊,於,惠,甄,麹,家," \
            + r"封,芮,羿,储,靳,汲,邴,糜,松,井,段,富,巫,乌,焦,巴,弓,牧,隗,山,谷,车,侯,宓,蓬,全,郗,班,仰,秋,仲,伊,宫,宁,仇,栾,暴,甘,斜,厉,戎,祖,武,符," \
            + r"刘,景,詹,束,龙,叶,幸,司,韶,郜,黎,蓟,薄,印,宿,白,怀,蒲,邰,从,鄂,索,咸,籍,赖,卓,蔺,屠,蒙,池,乔,阴,郁,胥,能,苍,双,闻,莘,党,翟,谭,贡,劳," \
            + r"逄,姬,申,扶,堵,冉,宰,郦,雍,郤,璩,桑,桂,濮,牛,寿,通,边,扈,燕,冀,郏,浦,尚,农,温,别,庄,晏,柴,瞿,阎,充,慕,连,茹,习,宦,艾,鱼,容,向,古,易," \
            + r"慎,戈,廖,庾,终,暨,居,衡,步,都,耿,满,弘,匡,国,文,寇,广,禄,阙,东,欧,殳,沃,利,蔚,越,夔,隆,师,巩,厍,聂,晁,勾,敖,融,冷,訾,辛,阚,那,简,饶," \
            + r"空,曾,毋,沙,乜,养,鞠,须,丰,巢,关,蒯,相,查,后,荆,红,游,竺,权,逑,盖,益,桓,公,万俟,司马,上官,欧阳,夏侯,诸葛,闻人,东方,赫连,皇甫,尉迟," \
            + r"公羊,澹台,公冶,宗政,濮阳,淳于,单于,太叔,申屠,公孙,仲孙,轩辕,令狐,锺离,宇文,长孙,慕容,鲜于,闾丘,司徒,司空,丌官,司寇,仉,督,子车," \
            + r"颛孙,端木,巫马,公西,漆雕,乐正,壤驷,公良,拓拔,夹谷,宰父,谷梁,晋,楚,阎,法,汝,鄢,涂,钦,段干,百里,东郭,南门,呼延,归,海,羊舌,微生,岳," \
            + r"帅,缑,亢,况,后,有,琴,梁丘,左丘,东门,西门,商,牟,佘,佴,伯,赏,南宫,墨,哈,谯,笪,年,爱,阳,佟,第五,言,福,邱]"
        # Given name: 1–4 further CJK chars, each optionally space-separated.
        first_name = r' {0,3}[\u4e00-\u9fa5]( {0,3}[\u4e00-\u9fa5]){0,3}'
        self.name_pattern = re.compile(last_name + first_name)
        # Mainland-China mobile numbers, allowing up to 4 spaces between digits.
        self.phone_pattern = re.compile(r'1 {0,4}(3 {0,4}\d|4 {0,4}[5-9]|5 {0,4}[0-35-9]|6 {0,4}[2567]|7 {0,4}[0-8]|8 {0,4}\d|9 {0,4}[0-35-9]) {0,4}(\d {0,4}){8}')
        self.email_pattern = re.compile(r'([a-zA-Z0-9_-] {0,4})+@([a-zA-Z0-9_-] {0,4})+(\. {0,4}([a-zA-Z0-9_-] {0,4})+)+')
        # "男"/"女", optionally preceded by a "性别" label; must be followed
        # by whitespace to avoid matching inside other words.
        self.gender_pattern = re.compile(r'(性 {0,8}别.*?)?\s*?(男|女)\s+?')
        self.age_patterns = [
            # explicit age: "NN岁" or "年龄 … NN"
            re.compile(r"(\d{1,2})岁|年龄.{0,10}(\d{1,2})"),
            # birth date: "生 … YYYY[年/.][MM[月/.]][DD[日/.]]"
            re.compile(r"生.{0,12}(([12]\d{3})[年|.]?(([01]?\d)[月|.]?)?(([0-3]?\d)[日|.]?)?)"),
        ]
        self.works_key_pattern = re.compile("工作经(历|验)|experience",re.M|re.I)
        # fix: raw string — '\D'/'\d' in a plain literal are invalid escapes.
        self.job_time_patterns = re.compile(r'([1-2]\d{3}(\D?[01]?\d){0,2})\D?([1-2]\d{3}(\D?[01]?\d){0,2}|至今)')
        # Degree keywords ordered from highest to lowest level.
        self.edu_index = ["博士","硕士","研究生","学士","本科","大专","专科","中专","高中","初中","小学"]
        self.edu_patterns = list(re.compile(i) for i in self.edu_index)
        # fix: "Unverisity" was a typo — English "University" never matched.
        self.school_pattern = re.compile(r"([a-zA-Z0-9 \u4e00-\u9fa5]{1,18}(学院|大学|中学|小学|学校|University|College))")

    def _is_url(self, path: str):
        """Return True when *path* is an http(s) URL rather than a local file."""
        return path.startswith(('http://', 'https://'))

    def __init_schools_data(
        self,
        paths: List[str],
    ):
        """Load the school -> degree-level table from Excel files.

        Each *path* may be a local file or an http(s) URL.  Column 1 holds
        the school name, column 5 (when present) its level; rows without a
        name, or repeating the "学校名称" header, are skipped.

        Raises:
            ValueError: when no school could be loaded.
        """
        schools = {}
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
        }
        for path in paths:
            stream = None
            if self._is_url(path):
                res = requests.get(path,headers=headers)
                if res.status_code==200:
                    stream = BytesIO(res.content)
            else:
                with open(path, 'rb') as f:
                    stream = BytesIO(f.read())
            df = pd.read_excel(stream)
            for row in df.iterrows():
                # fix: .iloc for positional access — row[1][1] relied on the
                # positional fallback of Series.__getitem__, deprecated in
                # pandas 2.x.
                name = row[1].iloc[1]
                if isinstance(name,float) and math.isnan(name):
                    continue
                if name=='学校名称':
                    continue
                # schools[school name] = degree level (e.g. 本科 / 专科)
                if len(row[1])>5:
                    schools[name] = row[1].iloc[5]
                else:
                    schools[name] = "成人学校"
        self.schools = schools
        if len(schools)==0:
            raise ValueError("学校数据为空")

    def __init_split_data(
        self
    ):
        # Delimiter characters used to cut text into blocks and to bound
        # entity-span expansion (CJK + ASCII punctuation and whitespace).
        self.splits = {'\\', '_', '"', '%', '{', '《', ')', '$', '(', '\n', '~', '*', ':', '!', ';', '”', '’', '\t', '?', '-', ';', '》', '】', '`', '、', '+', '“', '[', '—', '·', ')', '=', '‘', '}', '?', ',', '&', '@', '#', ']', '——', ' ', '.', '【', "'", '>', ',', '/', ':', '。', '...', '^', '(', '<', '|', '……', '!'}

    def to_date(self, datestr:str):
        """Parse a loose date string ("2020", "2020.7", "至今") into a date.

        Day-of-month is always normalised to 1 and the month is clamped to
        [1, 12].  "至今" (present) and any unparseable input both resolve to
        ``self.today``.
        """
        if re.match(r"^\d{4}$",datestr):
            return date(int(datestr),1,1)
        match = re.match(r"^\d{4}(\D)\d{1,2}",datestr)
        if match is not None:
            try:
                # clamp the month into the valid [1, 12] range
                m = min(max(int(datestr.split(match.group(1))[1]),1),12)
                return date(int(datestr.split(match.group(1))[0]),m,1)
            except ValueError:
                print(int(datestr.split(match.group(1))[0]),int(datestr.split(match.group(1))[1]))
                raise
        # "至今" and unknown formats both fall back to the reference date
        return self.today

    def split_to_blocks(
        self,
        text: str,
        max_block_len: int = 510,
        overlap: bool = True,
        max_overlap_len: int = 20,
    ):
        """Cut *text* at delimiter characters into model-sized blocks.

        Blocks are at most *max_block_len* characters (the pipeline's input
        limit); when *overlap* is set, consecutive blocks share up to
        *max_overlap_len* trailing characters so entities on a boundary are
        not lost.  Returns dicts with ``start``/``end`` offsets and ``text``.
        """
        block = {
            "start": -1,
            "end": -1,
            "text": "",
        }
        blocks = []
        # queue of recent delimiter positions: candidates for the next
        # block's (overlapping) start
        overlap_end = queue.Queue()
        for i in range(len(text)):
            if text[i] in self.splits:
                if block["start"]==-1:
                    continue
                # close the current block once it reaches the size limit
                if block["end"]!=-1 and i-block['start']>=max_block_len:
                    block["text"] = text[block["start"]:block["end"]]
                    blocks.append(block)
                    block = {
                        "start": overlap_end.queue[0]+1 if overlap else block['end']+1,
                        "end": -1,
                        "text": "",
                    }
                block["end"] = i
                # drop delimiters too far back to serve as overlap anchors
                while overlap_end.qsize()>0 and overlap_end.queue[0]+max_overlap_len<=i:
                    overlap_end.get()
                overlap_end.put(i)
            else:
                if block["start"]==-1:
                    block["start"] = i
        # flush the trailing block, if any
        if block["start"]!=-1:
            block["end"] = len(text)
            block["text"] = text[block["start"]:block["end"]]
            blocks.append(block)
        return blocks

    def get_expand_span(
        self,
        text: str,
        start: int,
        end: int,
        max_expand_length=20,
    ):
        """Widen [start, end) to the nearest delimiters on either side.

        Gives the pipelines some surrounding context (at most
        *max_expand_length* characters each way).  Returns the expanded text
        and its new boundaries.
        """
        expand_l,expand_r = start,end
        for l in range(max(start-max_expand_length,0), start):
            if text[l] in self.splits:
                expand_l = l+1
                break
        for r in range(min(end+max_expand_length,len(text)-1), end, -1):
            if text[r] in self.splits:
                expand_r = r
                break
        return text[expand_l:expand_r], expand_l, expand_r

    def remove_blanks(
        self,
        text: str,
        blank_pattern: re.Pattern,
    ):
        """Strip characters matching *blank_pattern* from *text*.

        Returns the cleaned string plus a mapping from each cleaned-text
        index back to the original index, so entity offsets can be projected
        onto the original text.
        """
        index_mapper = {}
        new_text = []
        for i in range(len(text)):
            if blank_pattern.match(text[i]) is not None:
                continue
            index_mapper[len(new_text)] = i
            new_text.append(text[i])
        return ''.join(new_text), index_mapper

    def process(self, text)->Dict[str, Any]:
        """Extract structured resume fields from raw *text*.

        Returns a dict of entity lists (name/age/gender/phone/email/schools/
        edus/jobs/titles) plus ``work_time`` (total years of experience,
        rounded up).  Every entity dict carries start/end offsets into the
        original *text* and the matched ``origin`` slice.
        """
        return_obj = {
            "name": [],
            "age": [],
            "gender": [],
            "phone": [],
            "email": [],
            "schools": [],
            "work_time": 0,
            "edus": [],
            "jobs": [],
            "titles": []
        }
        # Strip spaces first (PDF extraction often breaks names apart);
        # index_mapper projects offsets in the cleaned text back to *text*.
        remove_blanks_text, index_mapper = self.remove_blanks(text, re.compile(r' '))
        # Names: regex proposes candidates, the "name" pipeline confirms them.
        for name_match in self.name_pattern.finditer(remove_blanks_text):
            start,end = name_match.span()
            expand_text, start, end = self.get_expand_span(remove_blanks_text, start, end)
            entities = self.pipelines['name'](expand_text)
            for entity in entities:
                if entity['entity']=='NAME' and self.name_pattern.match(entity['word']) is not None:
                    obj = {
                        'start': index_mapper[start+entity['start']],
                        'end': index_mapper[start+entity['end']-1]+1,
                        'entity': 'NAME',
                        'text': entity['word']
                    }
                    repeat = False
                    for o in return_obj['name']:
                        if obj['start']==o['start'] and obj['end']==o['end']:
                            repeat = True
                            break
                    if not repeat:
                        obj['origin'] = text[obj['start']:obj['end']]
                        return_obj['name'].append(obj)
        # Age, pattern 0: explicit "NN岁" / "年龄 … NN"
        for age_match in self.age_patterns[0].finditer(remove_blanks_text):
            age = None
            s,e = -1,-1
            if age_match.group(1) is not None:
                age = age_match.group(1)
                s,e = age_match.span(1)
            elif age_match.group(2) is not None:
                age = age_match.group(2)
                # fix: span(2) — span(1) is (-1, -1) when group 1 did not
                # participate, which crashed the index_mapper lookup below
                s,e = age_match.span(2)
            if age is not None:
                return_obj['age'].append({
                    'start': index_mapper[s],
                    'end': index_mapper[e-1]+1,
                    'text': str(age),
                    'entity': 'AGE',
                    'origin': text[index_mapper[s]:index_mapper[e-1]+1]
                })
        # Age, pattern 1: derive age from a birth date
        for age_match in self.age_patterns[1].finditer(remove_blanks_text):
            age = None
            s,e = -1,-1
            year = age_match.group(2)
            if year is not None:
                year = int(year)
                month = age_match.group(4)
                if month is not None:
                    month = int(month)
                else:
                    month = 1
                day = age_match.group(6)
                if day is not None:
                    day = int(day)
                else:
                    day = 1
                # fix: use the injected reference date instead of
                # date.today() so results are deterministic and consistent
                # with to_date()'s handling of "至今"
                age = self.today.year - year
                if self.today.month<month or (self.today.month==month and self.today.day<day):
                    age -= 1
            if age is not None:
                s,e = age_match.span(1)
                return_obj['age'].append({
                    'start': index_mapper[s],
                    'end': index_mapper[e-1]+1,
                    'text': str(age),
                    'entity': 'AGE',
                    'origin': text[index_mapper[s]:index_mapper[e-1]+1]
                })
        # Schools via NER: regex proposes candidates, "common" pipeline
        # confirms them as ORG entities.
        for school_match in self.school_pattern.finditer(remove_blanks_text):
            start,end = school_match.span()
            expand_text, start, end = self.get_expand_span(remove_blanks_text, start, end)
            entities = self.pipelines['common'](expand_text)
            for entity in entities:
                if entity['entity']=="ORG" and self.school_pattern.search(entity['word']) is not None:
                    obj = {
                        'start': index_mapper[start+entity['start']],
                        'end': index_mapper[start+entity['end']-1]+1,
                        'entity': 'SCHOOL'
                    }
                    # attach the registry degree level when the name is known
                    for school in self.schools:
                        if school in entity['word']:
                            obj['text'] = school
                            obj["level"] = self.schools[school]
                            break
                    repeat = False
                    for o in return_obj['schools']:
                        if obj['start']==o['start'] and obj['end']==o['end']:
                            repeat = True
                            break
                    if not repeat:
                        obj['origin'] = text[obj['start']:obj['end']]
                        return_obj['schools'].append(obj)
        # Schools via literal lookup of every registry name.
        # fix: re.escape — school names may contain regex metacharacters
        # (e.g. parentheses), which previously corrupted the alternation.
        for school_match in re.finditer("|".join(re.escape(s) for s in self.schools), remove_blanks_text):
            start,end = school_match.span()
            obj = {
                'start': index_mapper[start],
                'end': index_mapper[end-1]+1,
                'entity': 'SCHOOL',
                'text': school_match.group(),
            }
            repeat = False
            for o in return_obj['schools']:
                if obj['start']==o['start'] and obj['end']==o['end']:
                    repeat = True
                    break
            if not repeat:
                obj['origin'] = text[obj['start']:obj['end']]
                obj['level'] = self.schools[obj['text']]
                return_obj['schools'].append(obj)
        # Degrees: keyword candidates confirmed as EDU by the pipeline.
        for i,pattern in enumerate(self.edu_patterns):
            for edu_match in pattern.finditer(remove_blanks_text):
                start,end = edu_match.span()
                expand_text, start, end = self.get_expand_span(remove_blanks_text, start, end)
                entities = self.pipelines['common'](expand_text)
                for entity in entities:
                    if entity['entity']=='EDU' and pattern.search(entity['word']) is not None:
                        obj = {
                            'start': index_mapper[start+entity['start']],
                            'end': index_mapper[start+entity['end']-1]+1,
                            'text': self.edu_index[i],
                            'entity': 'EDU',
                        }
                        repeat = False
                        for o in return_obj['edus']:
                            if obj['start']==o['start'] and obj['end']==o['end']:
                                repeat = True
                                break
                        if not repeat:
                            obj['origin'] = text[obj['start']:obj['end']]
                            return_obj['edus'].append(obj)
        # Jobs: only attempted when a work-experience section header exists.
        if self.works_key_pattern.search(remove_blanks_text) is not None:
            for job_time_match in self.job_time_patterns.finditer(remove_blanks_text):
                origin_start,origin_end = job_time_match.span()
                # parse the from/to dates of the employment period
                fr = self.to_date(job_time_match.group(1))
                fs,fe = job_time_match.span(1)
                to = self.to_date(job_time_match.group(3))
                ts,te = job_time_match.span(3)
                expand_text, start, end = self.get_expand_span(remove_blanks_text, origin_start, origin_end, max_expand_length=100)
                entities = self.pipelines['common'](expand_text)
                objs = []
                for entity in entities:
                    if entity['entity']=="ORG":
                        obj = {
                            'start': index_mapper[start+entity['start']],
                            'end': index_mapper[start+entity['end']-1]+1,
                            'entity': 'COMPANY',
                            'text': entity['word'],
                            # distance between the ORG span and the period
                            # span; the nearest ORG is taken as the employer
                            'dis': min(
                                abs(origin_start-start-entity['end']+1),
                                abs(origin_end-start-entity['start'])
                            ),
                        }
                        obj['origin'] = text[obj['start']:obj['end']]
                        objs.append(obj)
                objs.sort(key=lambda x:x['dis'])
                # skip periods whose nearest ORG looks like a school
                # (those belong to the education section)
                if len(objs)>0 and self.school_pattern.search(objs[0]['text']) is None:
                    del objs[0]['dis']
                    from_date = {
                        'start': index_mapper[fs],
                        'end': index_mapper[fe-1]+1,
                        'text': fr.isoformat(),
                        'entity': 'DATE',
                        'origin': text[index_mapper[fs]:index_mapper[fe-1]+1]
                    }
                    to_date = {
                        'start': index_mapper[ts],
                        'end': index_mapper[te-1]+1,
                        'text': to.isoformat(),
                        'entity': 'DATE',
                        'origin': text[index_mapper[ts]:index_mapper[te-1]+1]
                    }
                    jobs = [objs[0],from_date,to_date]
                    return_obj['jobs'].append(jobs)
            return_obj["jobs"].sort(key=lambda x:date.fromisoformat(x[1]['text']))
            # Total experience: sum month deltas, clipping overlaps so the
            # same period is not counted twice.
            last_end = None
            work_month = 0
            for i in range(0,len(return_obj["jobs"])):
                start = date.fromisoformat(return_obj["jobs"][i][1]['text'])
                end = date.fromisoformat(return_obj["jobs"][i][2]['text'])
                if last_end is not None and start<last_end:
                    start = last_end
                diff_y = end.year-start.year
                diff_m = end.month-start.month
                work_month += diff_y * 12 + diff_m
                last_end = end
            return_obj['work_time'] = math.ceil(work_month/12)
        # Phones, e-mails and gender are matched on the ORIGINAL text (their
        # patterns already tolerate embedded spaces).
        for phone_match in self.phone_pattern.finditer(text):
            start,end = phone_match.span()
            return_obj['phone'].append({
                'start': start,
                'end': end,
                'entity': 'PHONE',
                'origin': text[start:end],
                'text': re.sub(r'\s','',text[start:end])
            })
        for email_match in self.email_pattern.finditer(text):
            start,end = email_match.span()
            return_obj['email'].append({
                'start': start,
                'end': end,
                'entity': 'EMAIL',
                'origin': text[start:end],
                'text': re.sub(r'\s','',text[start:end])
            })
        for gender_match in self.gender_pattern.finditer(text):
            start,end = gender_match.span(2)
            return_obj['gender'].append({
                'start': start,
                'end': end,
                'entity': 'GENDER',
                'word': text[start:end],
                'text': text[start:end]
            })
        # Job titles: run the pipeline block-by-block over the whole text.
        for block in self.split_to_blocks(remove_blanks_text):
            entities = self.pipelines["common"](block["text"])
            for entity in entities:
                if entity['entity']=='TITLE':
                    obj = {
                        'start': index_mapper[block['start']+entity['start']],
                        'end': index_mapper[block['start']+entity['end']-1]+1,
                        'text': entity['word'],
                        'entity': 'TITLE',
                    }
                    obj['origin'] = text[obj['start']:obj['end']]
                    repeat = False
                    for o in return_obj['titles']:
                        if obj['start']==o['start'] and obj['end']==o['end']:
                            repeat = True
                            break
                    if not repeat:
                        return_obj['titles'].append(obj)
        return return_obj

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        """Alias for :meth:`process` so the instance itself is callable."""
        return self.process(*args, **kwds)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ transformers==4.30.1
2
+ gradio==3.36.1
3
+ huggingface-hub==0.15.1
4
+ torch==2.0.1
5
+ pandas==2.0.3
6
+ requests==2.31.0
7
+ xlrd==2.0.1