Daniel Varga committed
Commit 8424a77 · 1 Parent(s): 67d87f5

create embedding, deduplication, docs

Files changed (7):
  1. app.ini +1 -1
  2. app.py +1 -1
  3. create_embeddings.py +20 -13
  4. create_embeddings.sh +1 -0
  5. dedupe.py +41 -0
  6. hashes.sh +1 -0
  7. readme.sh +37 -0
app.ini CHANGED
@@ -1,3 +1,3 @@
  [DEFAULT]
- pkl = sample_fbi_s1e1.f16.pkl
+ pkl = 02_LOCATION_PHOTOS.deduped.f16.pkl
  url = https://static.renyi.hu/ai-shared/daniel/sameenergy/
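
For context: app.py presumably reads these keys with configparser. A minimal sketch of that assumption (the pkl/url keys come from app.ini above; everything else here is illustrative, not copied from app.py):

# Minimal sketch, assuming app.py loads app.ini with configparser.
import configparser

config = configparser.ConfigParser()
config.read("app.ini")
pkl_name = config["DEFAULT"]["pkl"]  # 02_LOCATION_PHOTOS.deduped.f16.pkl
base_url = config["DEFAULT"]["url"]  # https://static.renyi.hu/ai-shared/daniel/sameenergy/
print("embeddings pickle expected at", base_url + pkl_name)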
app.py CHANGED
@@ -174,4 +174,4 @@ with gr.Blocks(css="footer {visibility: hidden}") as demo:
  
  
  if __name__ == "__main__":
-     demo.launch()
+     demo.launch(share=False)
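
share=False keeps the demo local instead of opening a public gradio.live tunnel. A hedged sketch of the surrounding boilerplate (the Markdown body is a placeholder, not the app's real UI):

# Sketch of the gr.Blocks pattern app.py follows; the component is a placeholder.
import gradio as gr

with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.Markdown("same-energy-style image search")  # placeholder

if __name__ == "__main__":
    demo.launch(share=False)  # serve locally; share=True would create a public link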
create_embeddings.py CHANGED
@@ -13,6 +13,7 @@ def do_batch(batch, embeddings):
      image_features = model.encode_image(image_batch).float()
      embeddings += image_features.cpu().numpy().tolist()
      print(f"{len(embeddings)} done")
+     sys.stdout.flush()
  
  
  # even though it's not worth bothering with cuda,
@@ -38,19 +39,25 @@ batch = []
  for filename in sys.stdin:
      filename = filename.rstrip()
      if filename.lower().endswith("jpg") or filename.lower().endswith("jpeg"):
-         rgb = Image.open(filename).convert("RGB")
-         img = preprocess(rgb)
-         batch.append(img)
-         if len(batch) >= batch_size:
-             do_batch(batch, embeddings)
-             batch = []
-         filenames.append(filename)
-         if do_thumbs:
-             rgb.thumbnail((128, 128))
-             thumb = np.array(rgb)
-             thumbs.append(thumb)
-         if len(filenames) >= limit:
-             break
+         try:
+             rgb = Image.open(filename).convert("RGB")
+             img = preprocess(rgb)
+             batch.append(img)
+             if len(batch) >= batch_size:
+                 do_batch(batch, embeddings)
+                 batch = []
+             filenames.append(filename)
+             if do_thumbs:
+                 rgb.thumbnail((128, 128))
+                 thumb = np.array(rgb)
+                 thumbs.append(thumb)
+             if len(filenames) >= limit:
+                 break
+         except KeyboardInterrupt:
+             raise
+         except:
+             print(f"ERROR, skipping {filename}")
+             sys.stdout.flush()
  
  # remaining
  if len(batch) > 0:
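
The hunks reference model, preprocess, do_thumbs, batch_size and limit without showing their definitions. A hedged reconstruction of the assumed preamble (the CLIP variant and the helper's exact body are guesses consistent with model.encode_image and preprocess above, not copied from the repo):

# Assumed preamble for create_embeddings.py; ViT-B/32 is a guess.
import sys
import numpy as np
import torch
import clip
from PIL import Image

device = "cpu"  # per the comment above, cuda is not worth bothering with here
model, preprocess = clip.load("ViT-B/32", device=device)

def do_batch(batch, embeddings):
    # stack the preprocessed tensors and embed them in one forward pass
    image_batch = torch.stack(batch).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image_batch).float()
    embeddings += image_features.cpu().numpy().tolist()
    print(f"{len(embeddings)} done")
    sys.stdout.flush()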
create_embeddings.sh ADDED
@@ -0,0 +1 @@
+ cat jpg_files | python ~/experiments/kalman/se/create_embeddings.py 02_LOCATION_PHOTOS.pkl no-thumbs
dedupe.py ADDED
@@ -0,0 +1,41 @@
+ # takes a pickle, takes a file with md5sums, outputs the pickle without hash dupes.
+ 
+ import sys
+ import pickle
+ import numpy as np
+ 
+ 
+ pickle_filename, md5_filename, output_pickle_filename = sys.argv[1:]
+ 
+ hashes = {}
+ for line in open(md5_filename, "r"):
+     line = line.strip("\n")
+     md5 = line.split()[0]
+     assert len(md5) == 32
+     assert line[32:34] == "  "  # md5sum puts two spaces between hash and filename
+     filename = line[34:]
+     hashes[filename] = md5
+ print(len(hashes), "hashes read")
+ 
+ data = pickle.load(open(pickle_filename, "rb"))
+ print(len(data["embeddings"]), "embeddings read")
+ 
+ filenames = data["filenames"]
+ collected_indices = []
+ collected_md5s = set()
+ for i in range(len(filenames)):
+     filename = filenames[i]
+     md5 = hashes[filename]  # not the hash of the filename, the hash of the file content.
+     if md5 not in collected_md5s:
+         collected_indices.append(i)
+         collected_md5s.add(md5)
+ 
+ print(len(collected_indices), "unique hashes")
+ filenames = np.array(filenames)
+ 
+ data["filenames"] = filenames[collected_indices].tolist()
+ data["embeddings"] = data["embeddings"][collected_indices]
+ assert "thumbs" not in data
+ 
+ with open(output_pickle_filename, "wb") as f:
+     pickle.dump(data, f)
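
The fixed-offset slicing works because md5sum prints 32 hex characters, two spaces, then the filename; slicing rather than line.split() keeps filenames with spaces intact, which matters for paths like "02_LOCATION PHOTOS". A worked example with a made-up line (the hash shown is the md5 of the empty string):

# Hypothetical md5sum output line, parsed the same way dedupe.py does.
line = "d41d8cd98f00b204e9800998ecf8427e  02_LOCATION PHOTOS/some dir/img 001.jpg"
md5 = line.split()[0]
assert len(md5) == 32
assert line[32:34] == "  "   # the two-space separator
filename = line[34:]         # spaces inside the path survive
print(filename)              # 02_LOCATION PHOTOS/some dir/img 001.jpg

Note also that data["embeddings"][collected_indices] is numpy fancy indexing, so data["embeddings"] must already be an ndarray rather than a list; convert.py presumably stores it that way.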
hashes.sh ADDED
@@ -0,0 +1 @@
+ cat jpg_files | while read f ; do md5sum "$f" ; done > md5sums
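
Caveat: while read f strips leading whitespace and interprets backslashes, so unusual paths could be silently mangled (read -r f would at least fix the backslash part). A hedged Python stand-in, not in the repo, that emits the same two-space format dedupe.py parses:

# Hypothetical replacement for hashes.sh: md5 of file contents, one path per
# stdin line, printed in md5sum's "hash  filename" layout.
import hashlib
import sys

for filename in sys.stdin:
    filename = filename.rstrip("\n")
    h = hashlib.md5()
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks to bound memory
            h.update(chunk)
    print(f"{h.hexdigest()}  {filename}")

Usage would mirror hashes.sh: cat jpg_files | python hash_files.py > md5sums (the script name is invented).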
readme.sh CHANGED
@@ -22,3 +22,40 @@ python app.py
  
  python convert.py sample_fbi_s1e1.pkl
  # -> creates sample_fbi_s1e1.f16.pkl, dumbed down from float64 to float16.
+ 
+ 
+ ssh -p 2820 hexagon.renyi.hu
+ cd ai-shared/daniel/sameenergy
+ lftp -p 2167 gw.pioneer.hu
+ # manually provide username, password
+ cd store/05_Photos
+ # promising directories:
+ ls 02_LOCATION\ PHOTOS 05_TO_LOCATION_PHOTOS PhotoLibrary Tünde
+ mirror 02_LOCATION\ PHOTOS
+ ctrl-z
+ # -> puts the mirroring into the background.
+ ctrl-d
+ # -> exits lftp without terminating the background job, effectively making it nohup.
+ 
+ # scp'd the files to buda
+ cd /data/daniel/sameenergy/
+ find 02_LOCATION_PHOTOS -type f > raw_files
+ cat raw_files | grep -i "jpg$\|jpeg$" > jpg_files
+ 
+ # TODO
+ # chmod files on ai-shared
+ 
+ nohup bash create_embeddings.sh &
+ # ...but it's really just this:
+ # cat jpg_files | python ~/experiments/kalman/se/create_embeddings.py 02_LOCATION_PHOTOS.pkl no-thumbs
+ # -> processes ca. 200k images in about 8 hours, resulting in
+ # 02_LOCATION_PHOTOS.pkl
+ 
+ # hashes for deduplication:
+ bash hashes.sh
+ # takes jpg_files and outputs md5sums
+ 
+ python convert.py 02_LOCATION_PHOTOS.pkl
+ # -> creates float16 02_LOCATION_PHOTOS.f16.pkl
+ mv md5sums 02_LOCATION_PHOTOS.f16.md5sums
+ python dedupe.py 02_LOCATION_PHOTOS.f16.pkl 02_LOCATION_PHOTOS.f16.md5sums 02_LOCATION_PHOTOS.deduped.f16.pkl
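
A quick sanity check on the result, assuming the pickle layout implied by dedupe.py (a filenames list plus an embeddings ndarray); this snippet is illustrative, not part of the repo:

# Load the deduped pickle and confirm filenames and embeddings stayed aligned.
import pickle

with open("02_LOCATION_PHOTOS.deduped.f16.pkl", "rb") as f:
    data = pickle.load(f)
assert len(data["filenames"]) == len(data["embeddings"])
print(len(data["filenames"]), "unique images, dtype:", data["embeddings"].dtype)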