freemt committed
Commit 601d149 · Parent(s): c90742c
Update requirements.txt

Files changed:
- data/test_en.txt +69 -0
- data/test_zh.txt +74 -0
- gradiobee/__init__.py +1 -0
- gradiobee/__pycache__/__init__.cpython-37.pyc +0 -0
- gradiobee/__pycache__/__init__.cpython-38.pyc +0 -0
- gradiobee/__pycache__/cmat2tset.cpython-37.pyc +0 -0
- gradiobee/__pycache__/cmat2tset.cpython-38.pyc +0 -0
- gradiobee/__pycache__/docterm_scores.cpython-37.pyc +0 -0
- gradiobee/__pycache__/docterm_scores.cpython-38.pyc +0 -0
- gradiobee/__pycache__/en2zh.cpython-37.pyc +0 -0
- gradiobee/__pycache__/en2zh.cpython-38.pyc +0 -0
- gradiobee/__pycache__/en2zh_tokens.cpython-37.pyc +0 -0
- gradiobee/__pycache__/en2zh_tokens.cpython-38.pyc +0 -0
- gradiobee/__pycache__/gen_model.cpython-37.pyc +0 -0
- gradiobee/__pycache__/gen_model.cpython-38.pyc +0 -0
- gradiobee/__pycache__/insert_spaces.cpython-37.pyc +0 -0
- gradiobee/__pycache__/insert_spaces.cpython-38.pyc +0 -0
- gradiobee/__pycache__/mdx_e2c.cpython-37.pyc +0 -0
- gradiobee/__pycache__/mdx_e2c.cpython-38.pyc +0 -0
- gradiobee/__pycache__/plot_df.cpython-38.pyc +0 -0
- gradiobee/__pycache__/smatrix.cpython-37.pyc +0 -0
- gradiobee/__pycache__/smatrix.cpython-38.pyc +0 -0
- gradiobee/cmat2tset.py +59 -0
- gradiobee/docterm_scores.py +96 -0
- gradiobee/en2zh.py +40 -0
- gradiobee/en2zh_tokens.py +28 -0
- gradiobee/gen_model.py +115 -0
- gradiobee/insert_spaces.py +14 -0
- gradiobee/mdx_dict_e2c.lzma +0 -0
- gradiobee/mdx_e2c.py +40 -0
- gradiobee/plot_df.py +98 -0
- gradiobee/smatrix.py +100 -0
- pyrightconfig.json +11 -0
- requirements.txt +7 -1
data/test_en.txt
ADDED
@@ -0,0 +1,69 @@
+Wuthering Heights
+
+
+--------------------------------------------------------------------------------
+
+Chapter 2
+
+Chinese
+
+
+Yesterday afternoon set in misty and cold. I had half a mind to spend it by my study fire, instead of wading through heath and mud to Wuthering Heights. On coming up from dinner, however (N.B. I dine between twelve and one o'clock; the housekeeper, a matronly lady, taken as a fixture along with the house, could not, or would not, comprehend my request that I might be served at five), on mounting the stairs with this lazy intention, and stepping into the room, I saw a servant girl on her knees surrounded by brushes and coal-scuttles, and raising an infernal dust as she extinguished the flames with heaps of cinders. This spectacle drove me back immediately; I took my hat, and, after a four-miles' walk, arrived at Heathcliff's garden gate just in time to escape the first feathery flakes of a snow shower.
+
+On that bleak hill top the earth was hard with a black frost, and the air made me shiver through every limb. Being unable to remove the chain, I jumped over, and, running up the flagged causeway bordered with straggling gooseberry bushes, knocked vainly for admittance, till my knuckles tingled and the dogs howled.
+
+`Wretched inmates!' I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality. At least, I would not keep my doors barred in the day time. I don't care--I will get in!' So resolved, I grasped the latch and shook it vehemently. Vinegar-faced Joseph projected his head from a round window of the barn.
+
+`Whet are ye for?' he shouted. `T' maister's dahn i' t' fowld. Go rahnd by th' end ut' laith, if yah went tuh spake tull him.'
+
+`Is there nobody inside to open the door?' I hallooed, responsively.
+
+`They's nobbut t' missis; and shoo'll nut oppen't an ye mak yer flaysome dins till neeght.'
+
+`Why? Cannot you tell her who I am, eh, Joseph?'
+
+`Nor-ne me! Aw'll hae noa hend wi't,' muttered the head, vanishing.
+
+The snow began to drive thickly. I seized the handle to essay another trial; when a young man without coat, and shouldering a pitchfork, appeared in the yard behind. He hailed me to follow him, and, after marching through a wash-house, and a paved area containing a coal shed, pump, and pigeon cot, we at length arrived in the huge, warm, cheerful apartment, where I was formerly received. It glowed delightfully in the radiance of an immense fire, compounded of coal, peat, and wood; and near the table, laid for a plentiful evening meal, I was pleased to observe the `missis', an individual whose existence I had never previously suspected. I bowed and waited, thinking she would bid me take a seat. She looked at me, leaning back in her chair, and remained motionless and mute.
+
+`Rough weather!' I remarked. `I'm afraid, Mrs Heathcliff, the door must bear the consequence of your servants' leisure attendance: I had hard work to make them hear me.'
+
+She never opened her mouth. I stared--she stared also: at any rate, she kept her eyes on me in a cool, regardless manner, exceedingly embarrassing and disagreeable.
+
+`Sit down,' said the young man gruffly. `He'll be in soon.'
+
+I obeyed; and hemmed, and called the villain Juno, who deigned, at this second interview, to move the extreme tip of her tail, in token of owning my acquaintance.
+
+`A beautiful animal!' I commenced again. `Do you intend parting with the little ones, madam?'
+
+`They are not mine,' said the amiable hostess, more repellingly than Heathcliff himself could have replied.
+
+`Ah, your favourites are among these?' I continued, turning to an obscure cushion full of something like cats.
+
+`A strange choice of favourites!' she observed scornfully.
+
+Unluckily, it was a heap of dead rabbits. I hemmed once more, and drew closer to the hearth, repeating my comment on the wildness of the evening.
+
+`You should not have come out,' she said, rising and reaching from the chimney-piece two of the painted canisters.
+
+Her position before was sheltered from the light; now, I had a distinct view of her whole figure and countenance. She was slender, and apparently scarcely past girlhood: an admirable form, and the most exquisite little face that I have ever had the pleasure of beholding; small features, very fair; flaxen ringlets, or rather golden, hanging loose on her delicate neck; and eyes, had they been agreeable in expression, they would have been irresistible: fortunately for my susceptible heart, the only sentiment they evinced hovered between scorn, and a kind of desperation, singularly unnatural to be detected there. The canisters were almost out of her reach; I made a motion to aid her; she turned upon me as a miser might turn if anyone attempted to assist him in counting his gold.
+
+`I don't want your help,' she snapped; `I can get them for myself.'
+
+`I beg your pardon!' I hastened to reply.
+
+`Were you asked to tea?' she demanded, tying an apron over her neat black frock, and standing with a spoonful of the leaf poised over the pot.
+
+`I shall be glad to have a cup,' I answered.
+
+`Were you asked?' she repeated.
+
+`No,' I said, half smiling. `You are the proper person to ask me.'
+
+
+
+Contents PreviousChapter
+NextChapter
+
+
+Homepage

data/test_zh.txt
ADDED
@@ -0,0 +1,74 @@
+呼啸山庄
+
+--------------------------------------------------------------------------------
+
+第二章
+
+英文
+
+
+昨天下午又冷又有雾。我想就在书房炉边消磨一下午，不想踩着杂草污泥到呼啸山庄了。
+
+但是，吃过午饭(注意——我在十二点与一点钟之间吃午饭，而可以当作这所房子的附属物的管家婆，一位慈祥的太太却不能，或者并不愿理解我请求在五点钟开饭的用意)，在我怀着这个懒惰的想法上了楼，迈进屋子的时候，看见一个女仆跪在地上，身边是扫帚和煤斗。她正在用一堆堆煤渣封火，搞起一片弥漫的灰尘。这景象立刻把我赶回头了。我拿了帽子，走了四里路，到达了希刺克厉夫的花园口口，刚好躲过了一场今年初降的鹅毛大雪。
+
+在那荒凉的山顶上，土地由于结了一层黑冰而冻得坚硬，冷空气使我四肢发抖。我弄不开门链，就跳进去，顺着两边种着蔓延的醋栗树丛的石路跑去。我白白地敲了半天门，一直敲到我的手指骨都痛了，狗也狂吠起来。
+
+“倒霉的人家！”我心里直叫，“只为你这样无礼待客，就该一辈子跟人群隔离。我至少还不会在白天把门闩住。我才不管呢——我要进去！”如此决定了。我就抓住门闩，使劲摇它。苦脸的约瑟夫从谷仓的一个圆窗里探出头来。
+
+“你干吗？”他大叫。“主人在牛栏里，你要是找他说话，就从这条路口绕过去。”
+
+“屋里没人开门吗？”我也叫起来。
+
+“除了太太没有别人。你就是闹腾到夜里，她也不会开。”
+
+“为什么？你就不能告诉她我是谁吗，呃，约瑟夫？”
+
+“别找我！我才不管这些闲事呢，”这个脑袋咕噜着，又不见了。
+
+雪开始下大了。我握住门柄又试一回。这时一个没穿外衣的年轻人，扛着一根草耙，在后面院子里出现了。他招呼我跟着他走，穿过了一个洗衣房和一片铺平的地，那儿有煤棚、抽水机和鸽笼，我们终于到了我上次被接待过的那间温暖的、热闹的大屋子。煤、炭和木材混合在一起燃起的熊熊炉火，使这屋子放着光彩。在准备摆上丰盛晚餐的桌旁，我很高兴地看到了那位“太太”，以前我从未料想到会有这么一个人存在的。我鞠躬等候，以为她会叫我坐下。她望望我，往她的椅背一靠，不动，也不出声。
+
+“天气真坏！”我说，“希刺克厉夫太太，恐怕大门因为您的仆人偷懒而大吃苦头，我费了好大劲才使他们听见我敲门！”
+
+她死不开口。我瞪眼——她也瞪眼。反正她总是以一种冷冷的、漠不关心的神气盯住我，使人十分窘，而且不愉快。
+
+“坐下吧，”那年轻人粗声粗气地说，“他就要来了。”
+
+我服从了；轻轻咳了一下，叫唤那恶狗朱诺。临到第二次会面，它总算赏脸，摇起尾巴尖，表示认我是熟人了。
+
+“好漂亮的狗！”我又开始说话。“您是不是打算不要这些小的呢，夫人？”
+
+“那些不是我的，”这可爱可亲的女主人说，比希刺克厉夫本人所能回答的腔调还要更冷淡些。
+
+“啊，您所心爱的是在这一堆里啦！”我转身指着一个看不清楚的靠垫上那一堆像猫似的东西，接着说下去。
+
+“谁会爱这些东西那才怪呢！”她轻蔑地说。
+
+倒霉，原来那是堆死兔子。我又轻咳一声，向火炉凑近些，又把今晚天气不好的话评论一通。
+
+“你本来就不该出来。”她说，站起来去拿壁炉台上的两个彩色茶叶罐。
+
+她原先坐在光线被遮住的地方，现在我把她的全身和面貌都看得清清楚楚。她苗条，显然还没有过青春期。挺好看的体态，还有一张我生平从未有幸见过的绝妙的小脸蛋。五官纤丽，非常漂亮。淡黄色的卷发，或者不如说是金黄色的，松松地垂在她那细嫩的颈上。至于眼睛，要是眼神能显得和悦些，就要使人无法抗拒了。对我这容易动情的心说来倒是常事，因为它们所表现的只是在轻蔑与近似绝望之间的一种情绪，而在那张脸上看见那样的眼神是特别不自然的。
+
+她简直够不到茶叶罐。我动了一动，想帮她一下。她猛地扭转身向我，像守财奴看见别人打算帮他数他的金子一样。
+
+“我不要你帮忙，”她怒气冲冲地说，“我自己拿得到。”
+
+“对不起！”我连忙回答。
+
+“是请你来吃茶的吗？”她问，把一条围裙系在她那干净的黑衣服上，就这样站着，拿一匙茶叶正要往茶壶里倒。
+
+“我很想喝杯茶。”我回答。
+
+“是请你来的吗？”她又问。
+
+“没有，”我说，勉强笑一笑。“您正好请我喝茶。”
+
+
+
+
+目录
+上一章
+下一章
+
+
+返回首页

gradiobee/__init__.py
ADDED
@@ -0,0 +1 @@
+"""Init."""

gradiobee/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (179 Bytes)

gradiobee/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (131 Bytes)

gradiobee/__pycache__/cmat2tset.cpython-37.pyc
ADDED
Binary file (1.56 kB)

gradiobee/__pycache__/cmat2tset.cpython-38.pyc
ADDED
Binary file (1.51 kB)

gradiobee/__pycache__/docterm_scores.cpython-37.pyc
ADDED
Binary file (2.64 kB)

gradiobee/__pycache__/docterm_scores.cpython-38.pyc
ADDED
Binary file (2.61 kB)

gradiobee/__pycache__/en2zh.cpython-37.pyc
ADDED
Binary file (989 Bytes)

gradiobee/__pycache__/en2zh.cpython-38.pyc
ADDED
Binary file (947 Bytes)

gradiobee/__pycache__/en2zh_tokens.cpython-37.pyc
ADDED
Binary file (1.1 kB)

gradiobee/__pycache__/en2zh_tokens.cpython-38.pyc
ADDED
Binary file (1.06 kB)

gradiobee/__pycache__/gen_model.cpython-37.pyc
ADDED
Binary file (4.7 kB)

gradiobee/__pycache__/gen_model.cpython-38.pyc
ADDED
Binary file (4.67 kB)

gradiobee/__pycache__/insert_spaces.cpython-37.pyc
ADDED
Binary file (646 Bytes)

gradiobee/__pycache__/insert_spaces.cpython-38.pyc
ADDED
Binary file (602 Bytes)

gradiobee/__pycache__/mdx_e2c.cpython-37.pyc
ADDED
Binary file (945 Bytes)

gradiobee/__pycache__/mdx_e2c.cpython-38.pyc
ADDED
Binary file (901 Bytes)

gradiobee/__pycache__/plot_df.cpython-38.pyc
ADDED
Binary file (2.33 kB)

gradiobee/__pycache__/smatrix.cpython-37.pyc
ADDED
Binary file (2.68 kB)

gradiobee/__pycache__/smatrix.cpython-38.pyc
ADDED
Binary file (2.65 kB)

gradiobee/cmat2tset.py
ADDED
@@ -0,0 +1,59 @@
+"""Gen triple-set from a matrix."""
+from typing import List, Tuple, Union
+
+import numpy as np
+import pandas as pd
+
+
+# fmt: off
+def cmat2tset(
+        cmat1: Union[List[List[float]], np.ndarray, pd.DataFrame],
+        # thirdcol: bool = True
+# ) -> List[Union[Tuple[int, int], Tuple[int, int, float]]]:
+) -> np.ndarray:
+    # fmt: on
+    """Gen triple-set from a matrix.
+
+    Args
+        cmat1: 2d-array or list, correlation or other metric matrix
+        # thirdcol: bool, whether to output a third column (max value)
+
+    Returns
+        The max and argmax for each column; the winning row and column are
+        erased after each pick so that no single row can dominate every column.
+    """
+    # if isinstance(cmat, list):
+    cmat = np.array(cmat1)
+
+    if not np.prod(cmat.shape):
+        raise SystemError("data not 2d...")
+
+    _ = """
+    # y00 = range(cmat.shape[1])  # cmat.shape[0]: long time-wasting bug
+
+    yargmax = cmat.argmax(axis=0)
+    if thirdcol:
+        ymax = cmat.max(axis=0)
+
+        res = [*zip(y00, yargmax, ymax)]  # type: ignore
+        # to unzip
+        # a, b, c = zip(*res)
+
+        return res
+
+    _ = [*zip(y00, yargmax)]  # type: ignore
+    return _
+    """
+    low_ = cmat.min() - 1
+    argmax_max = []
+    src_len, tgt_len = cmat.shape
+    for _ in range(min(src_len, tgt_len)):
+        argmax = int(cmat.argmax())
+        row, col = divmod(argmax, tgt_len)
+        argmax_max.append([col, row, cmat.max()])
+
+        # erase the row-th row and col-th col of cmat
+        cmat[row, :] = low_
+        cmat[:, col] = low_
+
+    return np.array(argmax_max)
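
A minimal usage sketch for the `cmat2tset` added above (toy 2x3 matrix; the function copies its input via `np.array`, so the greedy argmax-and-erase loop leaves the caller's matrix untouched):

    import numpy as np
    from gradiobee.cmat2tset import cmat2tset

    cmat = [[0.1, 0.9, 0.2],
            [0.8, 0.3, 0.4]]
    tset = cmat2tset(cmat)
    # picks 0.9 first (col 1, row 0), erases that row and column, then picks 0.8:
    print(tset)  # [[1.  0.  0.9]
                 #  [0.  1.  0.8]]
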
gradiobee/docterm_scores.py
ADDED
@@ -0,0 +1,96 @@
+"""Generate a doc-term score matrix based on textacy.representations.Vectorizer.
+
+refer also to fast-scores fast_scores.py and gen_model.py (sklearn.feature_extraction.text.TfidfVectorizer).
+"""
+from typing import Dict, Iterable, List, Optional, Union
+import numpy as np
+from itertools import chain
+from psutil import virtual_memory
+from more_itertools import ilen
+
+from textacy.representations import Vectorizer
+from logzero import logger
+
+from gradiobee.gen_model import gen_model
+
+
+# fmt: off
+def docterm_scores(
+        doc1: Iterable[Iterable[str]],  # List[List[str]],
+        doc2: Iterable[Iterable[str]],
+        model: Vectorizer = None,
+        tf_type: str = 'linear',
+        idf_type: Optional[str] = "smooth",
+        # dl_type: Optional[str] = "sqrt",  # "lucene-style tfidf"
+        dl_type: Optional[str] = None,
+        norm: Optional[str] = "l2",  # + "l2"
+        min_df: Union[int, float] = 1,
+        max_df: Union[int, float] = 1.0,
+        max_n_terms: Optional[int] = None,
+        vocabulary_terms: Optional[Union[Dict[str, int], Iterable[str]]] = None
+) -> np.ndarray:
+    # fmt: on
+    """Generate a doc-term score matrix based on textacy.representations.Vectorizer.
+
+    Args
+        doc1: tokenized doc of n1
+        doc2: tokenized doc of n2
+        model: if None, generate one ad hoc from doc1 and doc2 ("lucene-style tfidf").
+        rest: refer to textacy.representations.Vectorizer
+    Attributes
+        model: the fitted vectorizer
+
+    Returns
+        n2 x n1 similarity matrix of float numbers (dt2 . dt1.T)
+    """
+    # make sure doc1/doc2 is of the right typing
+    try:
+        for xelm in iter(doc1):
+            for elm in iter(xelm):
+                assert isinstance(elm, str)
+    except AssertionError:
+        raise AssertionError(" doc1 is not of the typing Iterable[Iterable[str]] ")
+    except Exception as e:
+        logger.error(e)
+        raise
+    try:
+        for xelm in iter(doc2):
+            for elm in iter(xelm):
+                assert isinstance(elm, str)
+    except AssertionError:
+        raise AssertionError(" doc2 is not of the typing Iterable[Iterable[str]] ")
+    except Exception as e:
+        logger.error(e)
+        raise
+
+    if model is None:
+        model = gen_model(
+            [*chain(doc1, doc2)],
+            tf_type=tf_type,
+            idf_type=idf_type,
+            dl_type=dl_type,
+            norm=norm,
+            min_df=min_df,
+            max_df=max_df,
+            max_n_terms=max_n_terms,
+            vocabulary_terms=vocabulary_terms
+        )
+    docterm_scores.model = model
+
+    # a1 = dt.toarray(), a2 = doc_term_matrix.toarray()
+    # np.all(np.isclose(a1, a2))
+
+    dt1 = model.transform(doc1)
+    dt2 = model.transform(doc2)
+
+    # each matrix entry is a 64-bit float: 8 bytes
+    require_ram = ilen(iter(doc1)) * ilen(iter(doc2)) * 8
+    if require_ram > virtual_memory().available:
+        logger.warning("virtual_memory().available: %s", virtual_memory().available)
+        logger.warning("memory required: %s", require_ram)
+
+    if require_ram > virtual_memory().available * 10:
+        logger.warning("You'll likely encounter memory problems, such as slowed response and/or OOM.")
+
+    # return dt1.dot(dt2.T)
+    return dt2.toarray().dot(dt1.toarray().T)
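
A usage sketch for `docterm_scores` with made-up token lists. Note the orientation: despite talk of "n1 x n2", the last line computes `dt2.dot(dt1.T)`, so the result is n2 x n1:

    from gradiobee.docterm_scores import docterm_scores

    doc1 = [["i", "love", "cats"], ["dogs", "bark"]]        # n1 = 2 docs
    doc2 = [["cats", "purr"], ["dogs", "bark"], ["birds"]]  # n2 = 3 docs
    smat = docterm_scores(doc1, doc2)  # fits a vectorizer ad hoc from both docs
    print(smat.shape)  # (3, 2)
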
gradiobee/en2zh.py
ADDED
@@ -0,0 +1,40 @@
+"""Translate English to Chinese via a dict."""
+from typing import List, Union
+
+import warnings
+
+import copy
+from gradiobee.mdx_e2c import mdx_e2c
+
+warnings.simplefilter('ignore', DeprecationWarning)
+
+
+# fmt: off
+def en2zh(
+        # text: Union[str, List[List[str]]],
+        text: Union[str, List[str]],
+) -> List[str]:
+    # fmt: on
+    """Translate English to Chinese via a dict.
+
+    Args
+        text: to translate, str or list of str
+
+    Returns
+        res: list of str
+    """
+    res = copy.deepcopy(text)
+    if isinstance(text, str):
+        # res = [text.split()]
+        res = [text]
+
+    # if res and isinstance(res[0], str):
+    #     res = [line.lower().split() for line in res]
+
+    # res = ["".join([word_tr(word) for word in line]) for line in res]
+    _ = []
+    for line in res:
+        line_tr = [mdx_e2c(word) for word in line.split()]
+        _.append("".join(line_tr))
+
+    return _
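
A quick sketch of `en2zh` (word-for-word dictionary lookup; the actual output depends on the bundled mdx dictionary, so no concrete translation is shown):

    from gradiobee.en2zh import en2zh

    print(en2zh("I love cats"))                  # one str in -> list with one line
    print(en2zh(["hello world", "good night"]))  # one translated line per input line
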
gradiobee/en2zh_tokens.py
ADDED
@@ -0,0 +1,28 @@
+"""Translate English to Chinese via a dict."""
+from typing import List, Union
+
+from gradiobee.en2zh import en2zh
+from gradiobee.insert_spaces import insert_spaces
+
+
+# fmt: off
+def en2zh_tokens(
+        # text: Union[str, List[List[str]]],
+        text: Union[str, List[str]],
+        dedup: bool = True,
+) -> List[List[str]]:
+    # fmt: on
+    """Translate English to Chinese tokens via a dict.
+
+    Args
+        text: to translate, str or list of str
+        dedup: if True, remove all duplicate tokens
+    Returns
+        res: list of list of str/token/char
+    """
+    res = en2zh(text)
+
+    if dedup:
+        return [list(set(insert_spaces(elm).split())) for elm in res]
+
+    return [insert_spaces(elm).split() for elm in res]
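
And the tokenized variant; with `dedup=True` the token order within each line is not preserved, since the list round-trips through a set:

    from gradiobee.en2zh_tokens import en2zh_tokens

    tokens = en2zh_tokens("I love cats")
    # a list with one inner list: the deduplicated characters of the dictionary
    # entries for "i", "love", "cats" -- contents depend on the bundled mdx dict
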
gradiobee/gen_model.py
ADDED
@@ -0,0 +1,115 @@
+"""Generate a model (textacy.representations.Vectorizer).
+
+vectorizer = Vectorizer(
+    tf_type="linear", idf_type="smooth", norm="l2",
+    min_df=3, max_df=0.95)
+doc_term_matrix = vectorizer.fit_transform(tokenized_docs)
+doc_term_matrix
+
+tokenized_docs = [insert_spaces(elm).split() for elm in textzh]
+"""
+from typing import Dict, Iterable, List, Optional, Union
+
+from textacy.representations import Vectorizer
+from logzero import logger
+
+
+# fmt: off
+def gen_model(
+        tokenized_docs: Iterable[Iterable[str]],  # List[List[str]],
+        tf_type: str = 'linear',
+        idf_type: Optional[str] = "smooth",
+        dl_type: Optional[str] = None,  # "sqrt" for "lucene-style tfidf"
+        norm: Optional[str] = "l2",  # + "l2"
+        min_df: Union[int, float] = 1,
+        max_df: Union[int, float] = 1.0,
+        max_n_terms: Optional[int] = None,
+        vocabulary_terms: Optional[Union[Dict[str, int], Iterable[str]]] = None
+) -> Vectorizer:
+    # fmt: on
+    """Generate a model (textacy.representations.Vectorizer).
+
+    Args:
+        tokenized_docs: tokenized docs
+
+        (refer to textacy.representations.Vectorizer)
+        tf_type: Type of term frequency (tf) to use for weights' local component:
+
+            - "linear": tf (tfs are already linear, so left as-is)
+            - "sqrt": tf => sqrt(tf)
+            - "log": tf => log(tf) + 1
+            - "binary": tf => 1
+
+        idf_type: Type of inverse document frequency (idf) to use for weights'
+            global component:
+
+            - "standard": idf = log(n_docs / df) + 1.0
+            - "smooth": idf = log((n_docs + 1) / (df + 1)) + 1.0, i.e. 1 is added
+              to all document frequencies, as if a single document containing
+              every unique term was added to the corpus.
+            - "bm25": idf = log((n_docs - df + 0.5) / (df + 0.5)), which is
+              a form commonly used in information retrieval that allows for
+              very common terms to receive negative weights.
+            - None: no global weighting is applied to local term weights.
+
+        dl_type: Type of document-length scaling to use for weights'
+            normalization component:
+
+            - "linear": dl (dls are already linear, so left as-is)
+            - "sqrt": dl => sqrt(dl)
+            - "log": dl => log(dl)
+            - None: no normalization is applied to local (* global?) weights
+
+        norm: If "l1" or "l2", normalize weights by the L1 or L2 norms, respectively,
+            of row-wise vectors; otherwise, don't.
+        min_df: Minimum number of documents in which a term must appear for it to be
+            included in the vocabulary and as a column in a transformed doc-term matrix.
+            If float, value is the fractional proportion of the total number of docs,
+            which must be in [0.0, 1.0]; if int, value is the absolute number.
+        max_df: Maximum number of documents in which a term may appear for it to be
+            included in the vocabulary and as a column in a transformed doc-term matrix.
+            If float, value is the fractional proportion of the total number of docs,
+            which must be in [0.0, 1.0]; if int, value is the absolute number.
+        max_n_terms: If specified, only include terms whose document frequency is within
+            the top ``max_n_terms``.
+        vocabulary_terms: Mapping of unique term string to unique term id, or
+            an iterable of term strings that gets converted into such a mapping.
+            Note that, if specified, vectorized outputs will include *only* these terms.
+
+        "lucene-style tfidf": Adds a doc-length normalization to the usual local and global components.
+            Params: tf_type="linear", apply_idf=True, idf_type="smooth", apply_dl=True, dl_type="sqrt"
+
+        "lucene-style bm25": Uses a smoothed idf instead of the classic bm25 variant to prevent weights on terms from going negative.
+            Params: tf_type="bm25", apply_idf=True, idf_type="smooth", apply_dl=True, dl_type="linear"
+    Attributes:
+        doc_term_matrix
+    Returns:
+        the fit_transform'ed vectorizer
+    """
+    # make sure tokenized_docs is of the right typing
+    try:
+        for xelm in iter(tokenized_docs):
+            for elm in iter(xelm):
+                assert isinstance(elm, str)
+    except AssertionError:
+        raise AssertionError(" tokenized_docs is not of the typing Iterable[Iterable[str]] ")
+    except Exception as e:
+        logger.error(e)
+        raise
+
+    vectorizer = Vectorizer(
+        # tf_type="linear", idf_type="smooth", norm="l2", min_df=3, max_df=0.95
+        tf_type=tf_type,
+        idf_type=idf_type,
+        dl_type=dl_type,
+        norm=norm,
+        min_df=min_df,
+        max_df=max_df,
+        max_n_terms=max_n_terms,
+        vocabulary_terms=vocabulary_terms
+    )
+    doc_term_matrix = vectorizer.fit_transform(tokenized_docs)
+
+    gen_model.doc_term_matrix = doc_term_matrix
+
+    return vectorizer
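
A minimal sketch of fitting the vectorizer with `gen_model` on toy tokenized docs (assuming textacy's Vectorizer exposes the `terms_list` accessor for the learned vocabulary, as recent versions do; the doc-term matrix is stashed on the function object, as the code above shows):

    from gradiobee.gen_model import gen_model

    tokenized_docs = [["a", "b", "a"], ["b", "c"], ["a", "c", "c"]]
    vec = gen_model(tokenized_docs)
    print(vec.terms_list)                   # learned vocabulary, e.g. ['a', 'b', 'c']
    print(gen_model.doc_term_matrix.shape)  # (3, 3): 3 docs x 3 terms
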
gradiobee/insert_spaces.py
ADDED
@@ -0,0 +1,14 @@
+"""Insert spaces, mypython/split_chinese.py."""
+import re
+
+
+def insert_spaces(text: str) -> str:
+    """Insert space in Chinese characters.
+
+    >>> insert_spaces("test亨利it四世上")
+    ' test 亨 利 it 四 世 上 '
+    >>> insert_spaces("test亨利it四世上").strip().__len__()
+    17
+
+    """
+    return re.sub(r"(?<=[a-zA-Z\d]) (?=[a-zA-Z\d])", "", text.replace("", " "))
gradiobee/mdx_dict_e2c.lzma
ADDED
Binary file (1.18 MB)

gradiobee/mdx_e2c.py
ADDED
@@ -0,0 +1,40 @@
+"""Load mdx_dict_e2c c2e.
+
+mdx_e2c = joblib.load("./mdx_dict_e2c.lzma")
+mdx_c2e = joblib.load("./mdx_dict_e2c.lzma")
+"""
+from pathlib import Path
+from string import punctuation
+import joblib
+
+# keep "-"
+punctuation = punctuation.replace("-", "")
+c_dir = Path(__file__).parent
+
+# lazy load in __init__.py like this?
+# mdx_dict_e2c = importlib.import_module("mdx_dict_e2c")
+# mdx_e2c = mdx_dict_e2c.mdx_e2c
+# mdx_dict_c2e = importlib.import_module("mdx_dict_c2e")
+# mdx_c2e = mdx_dict_c2e.mdx_c2e
+
+mdx_dict_e2c = joblib.load(c_dir / "mdx_dict_e2c.lzma")
+print("e2c lzma file loaded")
+
+# memory = joblib.Memory("joblibcache", verbose=0)
+
+
+# @memory.cache  # no need, mdx_dict_e2c in RAM
+def mdx_e2c(word: str) -> str:
+    """Fetch definition for word.
+
+    Args:
+        word: word to look up
+    Returns:
+        definition entry or the word itself
+    >>> mdx_e2c("do").__len__()
+    43
+    >>> mdx_e2c("我").strip()
+    '我'
+    """
+    word = word.strip(punctuation + " \t\n\r")
+    return mdx_dict_e2c.get(word.lower(), word)
gradiobee/plot_df.py
ADDED
@@ -0,0 +1,98 @@
+"""Plot pandas.DataFrame with DBSCAN clustering."""
+# pylint: disable=invalid-name, too-many-arguments
+from typing import Optional
+
+# import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.cluster import DBSCAN
+
+from logzero import logger
+
+# turn interactive when in an ipython session
+if "get_ipython" in globals():
+    plt.ion()
+
+
+# fmt: off
+def plot_df(
+        df_: pd.DataFrame,
+        min_samples: int = 6,
+        eps: float = 10,
+        ylim: Optional[int] = None,
+        xlabel: str = "en",
+        ylabel: str = "zh",
+) -> plt:
+    # fmt: on
+    """Plot df with DBSCAN clustering.
+
+    Args:
+        df_: pandas.DataFrame with three columns: ["x", "y", "cos"]
+    Returns:
+        matplotlib.pyplot: for possible use in gradio
+
+    plot_df(pd.DataFrame(cmat2tset(smat), columns=['x', 'y', 'cos']))
+    df_ = pd.DataFrame(cmat2tset(smat), columns=['x', 'y', 'cos'])
+
+    # sort 'x', axis 0 changes, index regenerated
+    df_s = df_.sort_values('x', axis=0, ignore_index=True)
+
+    # sorting does not seem to impact clustering
+    DBSCAN(1.5, min_samples=3).fit(df_).labels_
+    DBSCAN(1.5, min_samples=3).fit(df_s).labels_
+
+    """
+    df_ = pd.DataFrame(df_)
+    if df_.columns.__len__() < 3:
+        logger.error(
+            "expected a DataFrame with 3 columns, got: %s, can't proceed, returning None",
+            df_.columns.tolist(),
+        )
+        return None
+
+    # take the first three columns
+    columns = df_.columns[:3]
+    df_ = df_[columns]
+
+    # rename columns to "x", "y", "cos"
+    df_.columns = ["x", "y", "cos"]
+
+    sns.set()
+    sns.set_style("darkgrid")
+    fig, (ax0, ax1) = plt.subplots(2, figsize=(11.69, 8.27))
+    fig.suptitle("alignment projection")
+    _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
+    _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
+
+    # ax0.scatter(df_[_].x, df_[_].y, marker='o', c='g', alpha=0.5)
+    # ax0.grid()
+    # print("ratio: %.2f%%" % (100 * sum(_) / len(df_)))
+
+    df_.plot.scatter("x", "y", c="cos", cmap="viridis_r", ax=ax0)
+
+    # clustered
+    df_[_].plot.scatter("x", "y", c="cos", cmap="viridis_r", ax=ax1)
+
+    # outliers
+    df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
+
+    ax0.set_xlabel("")
+    ax0.set_ylabel(ylabel)
+    xlim = len(df_)
+    ax0.set_xlim(0, xlim)
+    if ylim:
+        ax0.set_ylim(0, ylim)
+    ax0.set_title("max similarity along columns (outliers denoted by 'x')")
+
+    ax1.set_xlabel(xlabel)
+    ax1.set_ylabel(ylabel)
+
+    ax1.set_xlim(0, xlim)
+    if ylim:
+        ax1.set_ylim(0, ylim)
+    ax1.set_title(f"potential aligned pairs ({round(sum(_) / len(df_), 2):.0%})")
+
+    return plt
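
An end-to-end sketch tying the new pieces together, with a random matrix standing in for a real similarity matrix:

    import numpy as np
    import pandas as pd
    from gradiobee.cmat2tset import cmat2tset
    from gradiobee.plot_df import plot_df

    smat = np.random.rand(50, 60)  # stand-in for smatrix(doc1, doc2)
    df = pd.DataFrame(cmat2tset(smat), columns=["x", "y", "cos"])
    plt = plot_df(df, min_samples=6, eps=10)
    if plt is not None:
        plt.savefig("alignment.png")
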
gradiobee/smatrix.py
ADDED
@@ -0,0 +1,100 @@
+"""Generate a similarity matrix (doc-term score matrix) based on textacy.representations.Vectorizer.
+
+refer also to fast-scores fast_scores.py and gen_model.py (sklearn.feature_extraction.text.TfidfVectorizer).
+originally docterm_scores.py.
+"""
+from typing import Dict, Iterable, List, Optional, Union
+import numpy as np
+from itertools import chain
+from psutil import virtual_memory
+from more_itertools import ilen
+
+from textacy.representations import Vectorizer
+# from textacy.representations.vectorizers import Vectorizer
+from logzero import logger
+
+# from smatrix.gen_model import gen_model
+from gradiobee.gen_model import gen_model
+
+
+# fmt: off
+def smatrix(
+        doc1: Iterable[Iterable[str]],  # List[List[str]],
+        doc2: Iterable[Iterable[str]],
+        model: Vectorizer = None,
+        tf_type: str = 'linear',
+        idf_type: Optional[str] = "smooth",
+        # dl_type: Optional[str] = "sqrt",  # "lucene-style tfidf"
+        dl_type: Optional[str] = None,
+        norm: Optional[str] = "l2",  # + "l2"
+        min_df: Union[int, float] = 1,
+        max_df: Union[int, float] = 1.0,
+        max_n_terms: Optional[int] = None,
+        vocabulary_terms: Optional[Union[Dict[str, int], Iterable[str]]] = None
+) -> np.ndarray:
+    # fmt: on
+    """Generate a doc-term score matrix based on textacy.representations.Vectorizer.
+
+    Args
+        doc1: tokenized doc of n1
+        doc2: tokenized doc of n2
+        model: if None, generate one ad hoc from doc1 and doc2 ("lucene-style tfidf").
+        rest: refer to textacy.representations.Vectorizer
+    Attributes
+        model: the fitted vectorizer
+
+    Returns
+        n2 x n1 similarity matrix of float numbers (dt2 . dt1.T)
+    """
+    # make sure doc1/doc2 is of the right typing
+    try:
+        for xelm in iter(doc1):
+            for elm in iter(xelm):
+                assert isinstance(elm, str)
+    except AssertionError:
+        raise AssertionError(" doc1 is not of the typing Iterable[Iterable[str]] ")
+    except Exception as e:
+        logger.error(e)
+        raise
+    try:
+        for xelm in iter(doc2):
+            for elm in iter(xelm):
+                assert isinstance(elm, str)
+    except AssertionError:
+        raise AssertionError(" doc2 is not of the typing Iterable[Iterable[str]] ")
+    except Exception as e:
+        logger.error(e)
+        raise
+
+    if model is None:
+        model = gen_model(
+            [*chain(doc1, doc2)],
+            tf_type=tf_type,
+            idf_type=idf_type,
+            dl_type=dl_type,
+            norm=norm,
+            min_df=min_df,
+            max_df=max_df,
+            max_n_terms=max_n_terms,
+            vocabulary_terms=vocabulary_terms
+        )
+    # docterm_scores.model = model
+    smatrix.model = model
+
+    # a1 = dt.toarray(), a2 = doc_term_matrix.toarray()
+    # np.all(np.isclose(a1, a2))
+
+    dt1 = model.transform(doc1)
+    dt2 = model.transform(doc2)
+
+    # each matrix entry is a 64-bit float: 8 bytes
+    require_ram = ilen(iter(doc1)) * ilen(iter(doc2)) * 8
+    if require_ram > virtual_memory().available:
+        logger.warning("virtual_memory().available: %s", virtual_memory().available)
+        logger.warning("memory required: %s", require_ram)
+
+    if require_ram > virtual_memory().available * 10:
+        logger.warning("You're likely to encounter memory problems, such as slowed response and/or OOM.")
+
+    # return dt1.dot(dt2.T)
+    return dt2.toarray().dot(dt1.toarray().T)
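
Since `smatrix` stashes the fitted vectorizer on the function object, a second call can reuse it instead of refitting (sketch, same made-up docs as in the `docterm_scores` note above):

    from gradiobee.smatrix import smatrix

    doc1 = [["i", "love", "cats"], ["dogs", "bark"]]
    doc2 = [["cats", "purr"], ["dogs", "bark"], ["birds"]]
    smat = smatrix(doc1, doc2)                        # fits ad hoc, keeps the model
    smat2 = smatrix(doc1, doc2, model=smatrix.model)  # reuse the fitted vectorizer
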
pyrightconfig.json
ADDED
@@ -0,0 +1,11 @@
+{
+    "include": ["tests", "gradiobee"],
+    "venvPath": ".venv/Scripts",
+    "reportTypeshedErrors": false,
+    "reportMissingImports": true,
+    "reportMissingTypeStubs": false,
+
+    "pythonVersion": "3.7",
+
+    "ignore": []
+}

requirements.txt
CHANGED
@@ -2,4 +2,10 @@ chardet
 certifi
 charset-normalizer
 idna
-typing-extensions
+typing-extensions
+sklearn
+textacy
+logzero
+more_itertools
+psutil
+seaborn