Daniel Varga committed on
Commit
e6e7ab0
·
1 Parent(s): ae27165

PhotoLibrary. create_embeddings.py refactor, intermediate save.

Browse files
Files changed (4) hide show
  1. app.ini +2 -1
  2. create_embeddings.py +21 -12
  3. create_embeddings.sh +1 -1
  4. readme.sh +27 -0
app.ini CHANGED
@@ -1,3 +1,4 @@
1
  [DEFAULT]
2
- pkl = 02_LOCATION_PHOTOS.deduped.f16.pkl
 
3
  url = https://static.renyi.hu/ai-shared/daniel/sameenergy/
 
1
  [DEFAULT]
2
+ # pkl = 02_LOCATION_PHOTOS.deduped.f16.pkl
3
+ pkl = PhotoLibrary.854G.deduped.f16.pkl
4
  url = https://static.renyi.hu/ai-shared/daniel/sameenergy/
create_embeddings.py CHANGED
@@ -31,11 +31,27 @@ assert output_filename.endswith("pkl"), "first argument is the output pickle"
31
  assert sys.argv[2] in ("thumbs", "no-thumbs"), "second argument either thumbs or no-thumbs"
32
  do_thumbs = sys.argv[2] == "thumbs"
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  embeddings = []
35
  filenames = []
36
  thumbs = []
37
  print("starting processing")
38
  batch = []
 
39
  for filename in sys.stdin:
40
  filename = filename.rstrip()
41
  if filename.lower().endswith("jpg") or filename.lower().endswith("jpeg"):
@@ -43,10 +59,13 @@ for filename in sys.stdin:
43
  rgb = Image.open(filename).convert("RGB")
44
  img = preprocess(rgb)
45
  batch.append(img)
 
46
  if len(batch) >= batch_size:
47
  do_batch(batch, embeddings)
48
  batch = []
49
- filenames.append(filename)
 
 
50
  if do_thumbs:
51
  rgb.thumbnail((128, 128))
52
  thumb = np.array(rgb)
@@ -63,14 +82,4 @@ for filename in sys.stdin:
63
  if len(batch) > 0:
64
  do_batch(batch, embeddings)
65
 
66
- embeddings = np.array(embeddings)
67
- assert len(embeddings) == len(filenames)
68
- print(f"processed {len(embeddings)} images")
69
-
70
- data = {"embeddings": embeddings, "filenames": filenames}
71
- if do_thumbs:
72
- assert len(embeddings) == len(thumbs)
73
- data["thumbs"] = thumbs
74
-
75
- with open(output_filename, "wb") as f:
76
- pickle.dump(data, f)
 
31
  assert sys.argv[2] in ("thumbs", "no-thumbs"), "second argument either thumbs or no-thumbs"
32
  do_thumbs = sys.argv[2] == "thumbs"
33
 
34
+
35
+ def save(output_filename, embeddings, filenames):
36
+ embeddings = np.array(embeddings)
37
+ assert len(embeddings) == len(filenames)
38
+ print(f"processed {len(embeddings)} images")
39
+
40
+ data = {"embeddings": embeddings, "filenames": filenames}
41
+ if do_thumbs:
42
+ assert len(embeddings) == len(thumbs)
43
+ data["thumbs"] = thumbs
44
+
45
+ with open(output_filename, "wb") as f:
46
+ pickle.dump(data, f)
47
+
48
+
49
  embeddings = []
50
  filenames = []
51
  thumbs = []
52
  print("starting processing")
53
  batch = []
54
+ batch_count = 0
55
  for filename in sys.stdin:
56
  filename = filename.rstrip()
57
  if filename.lower().endswith("jpg") or filename.lower().endswith("jpeg"):
 
59
  rgb = Image.open(filename).convert("RGB")
60
  img = preprocess(rgb)
61
  batch.append(img)
62
+ filenames.append(filename)
63
  if len(batch) >= batch_size:
64
  do_batch(batch, embeddings)
65
  batch = []
66
+ batch_count += 1
67
+ if batch_count % 200 == 0:
68
+ save(output_filename, embeddings, filenames)
69
  if do_thumbs:
70
  rgb.thumbnail((128, 128))
71
  thumb = np.array(rgb)
 
82
  if len(batch) > 0:
83
  do_batch(batch, embeddings)
84
 
85
+ save(output_filename, embeddings, filenames)
 
 
 
 
 
 
 
 
 
 
create_embeddings.sh CHANGED
@@ -1 +1 @@
1
- cat jpg_files | python ~/experiments/kalman/se/create_embeddings.py 02_LOCATION_PHOTOS.pkl no-thumbs
 
1
+ cat PhotoLibrary.854G.deduped_md5sums | cut -f3- -d' ' | python ~/experiments/kalman/se/create_embeddings.py PhotoLibrary.854G.deduped.pkl no-thumbs
readme.sh CHANGED
@@ -68,3 +68,30 @@ python convert.py 02_LOCATION_PHOTOS.pkl
68
  # -> creates float16 02_LOCATION_PHOTOS.f16.pkl
69
  mv md5sums 02_LOCATION_PHOTOS.f16.md5sums
70
  python dedupe.py 02_LOCATION_PHOTOS.f16.pkl 02_LOCATION_PHOTOS.f16.md5sums 02_LOCATION_PHOTOS.deduped.f16.pkl
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  # -> creates float16 02_LOCATION_PHOTOS.f16.pkl
69
  mv md5sums 02_LOCATION_PHOTOS.f16.md5sums
70
  python dedupe.py 02_LOCATION_PHOTOS.f16.pkl 02_LOCATION_PHOTOS.f16.md5sums 02_LOCATION_PHOTOS.deduped.f16.pkl
71
+
72
+
73
+ # started downloading PhotoLibrary, but it's super big, 6 days of downloading and counting.
74
+ ssh buda
75
+ cd /data/daniel/sameenergy
76
+ nohup rsync -r hexagon.renyi.hu:./ai-shared/daniel/sameenergy/PhotoLibrary . &
77
+ # 30MB/sec, that's some 10 hours? don't forget that the source is still increasing.
78
+
79
+
80
+ nohup bash hashes.sh > md5.cout 2> md5.cerr &
81
+ # -> creates PhotoLibrary.854G.md5sums , md5.cout and md5.cerr are just logs.
82
+ cat PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" | wc
83
+
84
+ cat PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" > PhotoLibrary.854G.deduped_md5sums
85
+
86
+
87
+ cat 02_LOCATION_PHOTOS PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" > 02_and_PhotoLibrary.854G.deduped_md5sums
88
+
89
+ wc -l 02_and_PhotoLibrary.854G.deduped_md5sums PhotoLibrary.854G.deduped_md5sums
90
+ 591500 02_and_PhotoLibrary.854G.deduped_md5sums
91
+ 514706 PhotoLibrary.854G.deduped_md5sums
92
+ # -> it is not worth the hassle of merging them. let's just do PhotoLibrary.
93
+
94
+ # rsync has finished, turns out i've collected PhotoLibrary.854G.raw_files right before that,
95
+ # doing the complete set would need a re-hash, which is not worth the hassle either. staying with PhotoLibrary.854G.deduped_md5sums
96
+
97
+ # TODO I don't think lftp has finished successfully, because the Tünde folder has never arrived.