Spaces:
Sleeping
Sleeping
Daniel Varga
committed on
Commit
·
8424a77
1
Parent(s):
67d87f5
create embedding, deduplication, docs
Browse files- app.ini +1 -1
- app.py +1 -1
- create_embeddings.py +20 -13
- create_embeddings.sh +1 -0
- dedupe.py +41 -0
- hashes.sh +1 -0
- readme.sh +37 -0
app.ini
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
[DEFAULT]
|
2 |
-
pkl =
|
3 |
url = https://static.renyi.hu/ai-shared/daniel/sameenergy/
|
|
|
1 |
[DEFAULT]
|
2 |
+
pkl = 02_LOCATION_PHOTOS.deduped.f16.pkl
|
3 |
url = https://static.renyi.hu/ai-shared/daniel/sameenergy/
|
app.py
CHANGED
@@ -174,4 +174,4 @@ with gr.Blocks(css="footer {visibility: hidden}") as demo:
|
|
174 |
|
175 |
|
176 |
if __name__ == "__main__":
|
177 |
-
demo.launch()
|
|
|
174 |
|
175 |
|
176 |
if __name__ == "__main__":
|
177 |
+
demo.launch(share=False)
|
create_embeddings.py
CHANGED
@@ -13,6 +13,7 @@ def do_batch(batch, embeddings):
|
|
13 |
image_features = model.encode_image(image_batch).float()
|
14 |
embeddings += image_features.cpu().numpy().tolist()
|
15 |
print(f"{len(embeddings)} done")
|
|
|
16 |
|
17 |
|
18 |
# even though it's not worth bothering with cuda,
|
@@ -38,19 +39,25 @@ batch = []
|
|
38 |
for filename in sys.stdin:
|
39 |
filename = filename.rstrip()
|
40 |
if filename.lower().endswith("jpg") or filename.lower().endswith("jpeg"):
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
# remaining
|
56 |
if len(batch) > 0:
|
|
|
13 |
image_features = model.encode_image(image_batch).float()
|
14 |
embeddings += image_features.cpu().numpy().tolist()
|
15 |
print(f"{len(embeddings)} done")
|
16 |
+
sys.stdout.flush()
|
17 |
|
18 |
|
19 |
# even though it's not worth bothering with cuda,
|
|
|
39 |
# Read one image path per line from stdin, and queue each readable JPEG for
# embedding. filenames, batch/embeddings and (optionally) thumbs are parallel
# sequences, so an entry is only ever added to all of them or to none.
for filename in sys.stdin:
    filename = filename.rstrip()
    if not filename.lower().endswith(("jpg", "jpeg")):
        continue

    # Only the per-file work is guarded: an unreadable or corrupt image is
    # reported and skipped before anything has been appended. Exception (rather
    # than a bare except) lets KeyboardInterrupt and SystemExit propagate.
    try:
        # Close the file handle right away; convert() returns a loaded copy.
        with Image.open(filename) as opened:
            rgb = opened.convert("RGB")
        img = preprocess(rgb)
        thumb = None
        if do_thumbs:
            # thumbnail() resizes in place, so it must run after preprocess().
            rgb.thumbnail((128, 128))
            thumb = np.array(rgb)
    except Exception:
        print(f"ERROR, skipping {filename}")
        sys.stdout.flush()
        continue

    batch.append(img)
    filenames.append(filename)
    if do_thumbs:
        thumbs.append(thumb)

    # A failure inside do_batch is deliberately NOT swallowed: continuing after
    # it would leave embeddings and filenames out of step with each other.
    if len(batch) >= batch_size:
        do_batch(batch, embeddings)
        batch = []

    if len(filenames) >= limit:
        break
|
61 |
|
62 |
# remaining
|
63 |
if len(batch) > 0:
|
create_embeddings.sh
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
cat jpg_files | python ~/experiments/kalman/se/create_embeddings.py 02_LOCATION_PHOTOS.pkl no-thumbs
|
dedupe.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Takes an embeddings pickle and a file with md5sums, and writes the pickle
# back out without entries whose file content hash was already seen.
#
# Usage: python dedupe.py INPUT_PICKLE MD5SUMS OUTPUT_PICKLE

import string
import sys
import pickle
import numpy as np


_HEX_DIGITS = frozenset(string.hexdigits)
# md5sum puts two spaces (text mode) or " *" (binary mode) between digest and name.
_SEPARATORS = ("  ", " *")


def _read_md5sums(md5_filename):
    """Parse md5sum output into a {filename: md5 hex digest} dict.

    Blank lines are ignored. Raises ValueError on any other line that is not
    of the form "<32 hex chars><two-char separator><filename>".
    """
    hashes = {}
    with open(md5_filename, "r") as f:
        for lineno, raw in enumerate(f, start=1):
            line = raw.rstrip("\n")
            if not line:
                continue
            # Slice by position instead of split(): file names may contain spaces.
            md5, sep, filename = line[:32], line[32:34], line[34:]
            well_formed = (
                len(md5) == 32
                and _HEX_DIGITS.issuperset(md5)
                and sep in _SEPARATORS
                and filename
            )
            if not well_formed:
                raise ValueError(
                    f"{md5_filename}:{lineno}: not an md5sum line: {line!r}"
                )
            hashes[filename] = md5
    return hashes


def _first_occurrence_indices(filenames, hashes):
    """Return indices of filenames whose content hash has not appeared earlier.

    The lookup is by file name, but the value compared is the hash of the file
    content, not of the name. Raises KeyError naming the file if it has no hash.
    """
    seen_md5s = set()
    kept = []
    for i, filename in enumerate(filenames):
        try:
            md5 = hashes[filename]
        except KeyError:
            raise KeyError(f"no md5sum found for {filename!r}") from None
        if md5 not in seen_md5s:
            seen_md5s.add(md5)
            kept.append(i)
    return kept


def main(argv=None):
    """Deduplicate a pickle by content hash.

    argv holds the three command line arguments (input pickle, md5sums file,
    output pickle); when None, sys.argv[1:] is used.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 3:
        raise SystemExit("usage: dedupe.py INPUT_PICKLE MD5SUMS OUTPUT_PICKLE")
    pickle_filename, md5_filename, output_pickle_filename = args

    hashes = _read_md5sums(md5_filename)
    print(len(hashes), "hashes read")

    # NOTE: unpickling runs arbitrary code from the file; only load trusted pickles.
    with open(pickle_filename, "rb") as f:
        data = pickle.load(f)
    print(len(data["embeddings"]), "embeddings read")

    # Thumbnails would have to be filtered as well, which is not implemented.
    if "thumbs" in data:
        raise ValueError("pickles containing 'thumbs' are not supported")

    filenames = data["filenames"]
    if len(filenames) != len(data["embeddings"]):
        raise ValueError("filenames and embeddings differ in length")

    kept = _first_occurrence_indices(filenames, hashes)
    print(len(kept), "unique hashes")

    data["filenames"] = [filenames[i] for i in kept]
    # asarray is a no-op for an ndarray and makes a plain list indexable too.
    data["embeddings"] = np.asarray(data["embeddings"])[kept]

    with open(output_pickle_filename, "wb") as f:
        pickle.dump(data, f)


if __name__ == "__main__":
    main()
|
hashes.sh
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# IFS= and -r keep leading/trailing whitespace and backslashes in file names intact.
while IFS= read -r f ; do md5sum "$f" ; done < jpg_files > md5sums
|
readme.sh
CHANGED
@@ -22,3 +22,40 @@ python app.py
|
|
22 |
|
23 |
python convert.py sample_fbi_s1e1.pkl
|
24 |
# -> creates sample_fbi_s1e1.f16.pkl, downcast from float64 to float16.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
python convert.py sample_fbi_s1e1.pkl
|
24 |
# -> creates sample_fbi_s1e1.f16.pkl, downcast from float64 to float16.
|
25 |
+
|
26 |
+
|
27 |
+
ssh -p 2820 hexagon.renyi.hu
|
28 |
+
cd ai-shared/daniel/sameenergy
|
29 |
+
lftp -p 2167 gw.pioneer.hu
|
30 |
+
# manually provide username, password
|
31 |
+
cd store/05_Photos
|
32 |
+
# promising directories:
|
33 |
+
ls 02_LOCATION\ PHOTOS 05_TO_LOCATION_PHOTOS PhotoLibrary Tünde
|
34 |
+
mirror 02_LOCATION\ PHOTOS
|
35 |
+
ctrl-z
|
36 |
+
# -> puts mirroring to background.
|
37 |
+
ctrl-d
|
38 |
+
# -> exits lftp without terminating the background job, making it nohup.
|
39 |
+
|
40 |
+
# scp'd files to buda
|
41 |
+
cd /data/daniel/sameenergy/
|
42 |
+
find 02_LOCATION_PHOTOS -type f > raw_files
|
43 |
+
# Require the dot and anchor both extensions to the end of the path.
grep -iE '\.jpe?g$' raw_files > jpg_files
|
44 |
+
|
45 |
+
# TODO
|
46 |
+
# chmod files on ai-shared
|
47 |
+
|
48 |
+
nohup bash create_embeddings.sh &
|
49 |
+
# ...but it's really just this:
|
50 |
+
# cat jpg_files | python ~/experiments/kalman/se/create_embeddings.py 02_LOCATION_PHOTOS.pkl no-thumbs
|
51 |
+
# -> after roughly 8 hours it has processed circa 200k images, resulting in
|
52 |
+
# 02_LOCATION_PHOTOS.pkl
|
53 |
+
|
54 |
+
# hashes for deduplication:
|
55 |
+
bash hashes.sh
|
56 |
+
# takes jpg_files and outputs md5sums
|
57 |
+
|
58 |
+
python convert.py 02_LOCATION_PHOTOS.pkl
|
59 |
+
# -> creates float16 02_LOCATION_PHOTOS.f16.pkl
|
60 |
+
mv md5sums 02_LOCATION_PHOTOS.f16.md5sums
|
61 |
+
python dedupe.py 02_LOCATION_PHOTOS.f16.pkl 02_LOCATION_PHOTOS.f16.md5sums 02_LOCATION_PHOTOS.deduped.f16.pkl
|