Daniel Varga committed
Commit 8424a77 · 1 Parent(s): 67d87f5

create embedding, deduplication, docs

Files changed (7):
  1. app.ini +1 -1
  2. app.py +1 -1
  3. create_embeddings.py +20 -13
  4. create_embeddings.sh +1 -0
  5. dedupe.py +41 -0
  6. hashes.sh +1 -0
  7. readme.sh +37 -0
app.ini CHANGED
@@ -1,3 +1,3 @@
  [DEFAULT]
- pkl = sample_fbi_s1e1.f16.pkl
+ pkl = 02_LOCATION_PHOTOS.deduped.f16.pkl
  url = https://static.renyi.hu/ai-shared/daniel/sameenergy/
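
For context: app.py presumably reads these keys with configparser. A minimal sketch of that assumption (the pkl/url keys come from app.ini above; everything else here is illustrative, not copied from app.py):

# Minimal sketch, assuming app.py loads app.ini with configparser.
import configparser

config = configparser.ConfigParser()
config.read("app.ini")
pkl_name = config["DEFAULT"]["pkl"]  # 02_LOCATION_PHOTOS.deduped.f16.pkl
base_url = config["DEFAULT"]["url"]  # https://static.renyi.hu/ai-shared/daniel/sameenergy/
print("embeddings pickle expected at", base_url + pkl_name)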
app.py CHANGED
@@ -174,4 +174,4 @@ with gr.Blocks(css="footer {visibility: hidden}") as demo:
  
  
  if __name__ == "__main__":
-     demo.launch()
+     demo.launch(share=False)
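
share=False keeps the demo local instead of opening a public gradio.live tunnel. A hedged sketch of the surrounding boilerplate (the Markdown body is a placeholder, not the app's real UI):

# Sketch of the gr.Blocks pattern app.py follows; the component is a placeholder.
import gradio as gr

with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.Markdown("same-energy-style image search")  # placeholder

if __name__ == "__main__":
    demo.launch(share=False)  # serve locally; share=True would create a public link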
create_embeddings.py CHANGED
@@ -13,6 +13,7 @@ def do_batch(batch, embeddings):
      image_features = model.encode_image(image_batch).float()
      embeddings += image_features.cpu().numpy().tolist()
      print(f"{len(embeddings)} done")
+     sys.stdout.flush()
  
  
  # even though it's not worth bothering with cuda,
@@ -38,19 +39,25 @@ batch = []
  for filename in sys.stdin:
      filename = filename.rstrip()
      if filename.lower().endswith("jpg") or filename.lower().endswith("jpeg"):
-         rgb = Image.open(filename).convert("RGB")
-         img = preprocess(rgb)
-         batch.append(img)
-         if len(batch) >= batch_size:
-             do_batch(batch, embeddings)
-             batch = []
-         filenames.append(filename)
-         if do_thumbs:
-             rgb.thumbnail((128, 128))
-             thumb = np.array(rgb)
-             thumbs.append(thumb)
-         if len(filenames) >= limit:
-             break
+         try:
+             rgb = Image.open(filename).convert("RGB")
+             img = preprocess(rgb)
+             batch.append(img)
+             if len(batch) >= batch_size:
+                 do_batch(batch, embeddings)
+                 batch = []
+             filenames.append(filename)
+             if do_thumbs:
+                 rgb.thumbnail((128, 128))
+                 thumb = np.array(rgb)
+                 thumbs.append(thumb)
+             if len(filenames) >= limit:
+                 break
+         except KeyboardInterrupt:
+             raise
+         except:
+             print(f"ERROR, skipping {filename}")
+             sys.stdout.flush()
  
  # remaining
  if len(batch) > 0:
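
The hunks reference model, preprocess, do_thumbs, batch_size and limit without showing their definitions. A hedged reconstruction of the assumed preamble (the CLIP variant and the helper's exact body are guesses consistent with model.encode_image and preprocess above, not copied from the repo):

# Assumed preamble for create_embeddings.py; ViT-B/32 is a guess.
import sys
import numpy as np
import torch
import clip
from PIL import Image

device = "cpu"  # per the comment above, cuda is not worth bothering with here
model, preprocess = clip.load("ViT-B/32", device=device)

def do_batch(batch, embeddings):
    # stack the preprocessed tensors and embed them in one forward pass
    image_batch = torch.stack(batch).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image_batch).float()
    embeddings += image_features.cpu().numpy().tolist()
    print(f"{len(embeddings)} done")
    sys.stdout.flush()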
create_embeddings.sh ADDED
@@ -0,0 +1 @@
+ cat jpg_files | python ~/experiments/kalman/se/create_embeddings.py 02_LOCATION_PHOTOS.pkl no-thumbs
dedupe.py ADDED
@@ -0,0 +1,41 @@
+ # takes a pickle, takes a file with md5sums, outputs the pickle without hash dupes.
+ 
+ import sys
+ import pickle
+ import numpy as np
+ 
+ 
+ pickle_filename, md5_filename, output_pickle_filename = sys.argv[1:]
+ 
+ hashes = {}
+ for line in open(md5_filename, "r"):
+     line = line.strip("\n")
+     md5 = line.split()[0]
+     assert len(md5) == 32
+     assert line[32:34] == "  "  # md5sum puts two spaces between hash and filename
+     filename = line[34:]
+     hashes[filename] = md5
+ print(len(hashes), "hashes read")
+ 
+ data = pickle.load(open(pickle_filename, "rb"))
+ print(len(data["embeddings"]), "embeddings read")
+ 
+ filenames = data["filenames"]
+ collected_indices = []
+ collected_md5s = set()
+ for i in range(len(filenames)):
+     filename = filenames[i]
+     md5 = hashes[filename]  # not the hash of the filename, the hash of the file content.
+     if md5 not in collected_md5s:
+         collected_indices.append(i)
+         collected_md5s.add(md5)
+ 
+ print(len(collected_indices), "unique hashes")
+ filenames = np.array(filenames)
+ 
+ data["filenames"] = filenames[collected_indices].tolist()
+ data["embeddings"] = data["embeddings"][collected_indices]
+ assert "thumbs" not in data
+ 
+ with open(output_pickle_filename, "wb") as f:
+     pickle.dump(data, f)
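
The fixed-offset slicing works because md5sum prints 32 hex characters, two spaces, then the filename; slicing rather than line.split() keeps filenames with spaces intact, which matters for paths like "02_LOCATION PHOTOS". A worked example with a made-up line (the hash shown is the md5 of the empty string):

# Hypothetical md5sum output line, parsed the same way dedupe.py does.
line = "d41d8cd98f00b204e9800998ecf8427e  02_LOCATION PHOTOS/some dir/img 001.jpg"
md5 = line.split()[0]
assert len(md5) == 32
assert line[32:34] == "  "   # the two-space separator
filename = line[34:]         # spaces inside the path survive
print(filename)              # 02_LOCATION PHOTOS/some dir/img 001.jpg

Note also that data["embeddings"][collected_indices] is numpy fancy indexing, so data["embeddings"] must already be an ndarray rather than a list; convert.py presumably stores it that way.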
hashes.sh ADDED
@@ -0,0 +1 @@
+ cat jpg_files | while read f ; do md5sum "$f" ; done > md5sums
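
Caveat: while read f strips leading whitespace and interprets backslashes, so unusual paths could be silently mangled (read -r f would at least fix the backslash part). A hedged Python stand-in, not in the repo, that emits the same two-space format dedupe.py parses:

# Hypothetical replacement for hashes.sh: md5 of file contents, one path per
# stdin line, printed in md5sum's "hash  filename" layout.
import hashlib
import sys

for filename in sys.stdin:
    filename = filename.rstrip("\n")
    h = hashlib.md5()
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks to bound memory
            h.update(chunk)
    print(f"{h.hexdigest()}  {filename}")

Usage would mirror hashes.sh: cat jpg_files | python hash_files.py > md5sums (the script name is invented).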
readme.sh CHANGED
@@ -22,3 +22,40 @@ python app.py
  
  python convert.py sample_fbi_s1e1.pkl
  # -> creates sample_fbi_s1e1.f16.pkl, dumbed down from float64 to float16.
+ 
+ 
+ ssh -p 2820 hexagon.renyi.hu
+ cd ai-shared/daniel/sameenergy
+ lftp -p 2167 gw.pioneer.hu
+ # manually provide username, password
+ cd store/05_Photos
+ # promising directories:
+ ls 02_LOCATION\ PHOTOS 05_TO_LOCATION_PHOTOS PhotoLibrary Tünde
+ mirror 02_LOCATION\ PHOTOS
+ ctrl-z
+ # -> puts the mirroring into the background.
+ ctrl-d
+ # -> exits lftp without terminating the background job, effectively making it nohup.
+ 
+ # scp'd the files to buda
+ cd /data/daniel/sameenergy/
+ find 02_LOCATION_PHOTOS -type f > raw_files
+ cat raw_files | grep -i "jpg$\|jpeg$" > jpg_files
+ 
+ # TODO
+ # chmod files on ai-shared
+ 
+ nohup bash create_embeddings.sh &
+ # ...but it's really just this:
+ # cat jpg_files | python ~/experiments/kalman/se/create_embeddings.py 02_LOCATION_PHOTOS.pkl no-thumbs
+ # -> processes ca. 200k images in about 8 hours, resulting in
+ # 02_LOCATION_PHOTOS.pkl
+ 
+ # hashes for deduplication:
+ bash hashes.sh
+ # takes jpg_files and outputs md5sums
+ 
+ python convert.py 02_LOCATION_PHOTOS.pkl
+ # -> creates float16 02_LOCATION_PHOTOS.f16.pkl
+ mv md5sums 02_LOCATION_PHOTOS.f16.md5sums
+ python dedupe.py 02_LOCATION_PHOTOS.f16.pkl 02_LOCATION_PHOTOS.f16.md5sums 02_LOCATION_PHOTOS.deduped.f16.pkl
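
A quick sanity check on the result, assuming the pickle layout implied by dedupe.py (a filenames list plus an embeddings ndarray); this snippet is illustrative, not part of the repo:

# Load the deduped pickle and confirm filenames and embeddings stayed aligned.
import pickle

with open("02_LOCATION_PHOTOS.deduped.f16.pkl", "rb") as f:
    data = pickle.load(f)
assert len(data["filenames"]) == len(data["embeddings"])
print(len(data["filenames"]), "unique images, dtype:", data["embeddings"].dtype)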