commit e6e7ab0 (parent: ae27165)
Author: Daniel Varga

    PhotoLibrary. create_embeddings.py refactor, intermediate save.

Files changed:
- app.ini +2 -1
- create_embeddings.py +21 -12
- create_embeddings.sh +1 -1
- readme.sh +27 -0
app.ini
CHANGED
@@ -1,3 +1,4 @@
 [DEFAULT]
-pkl = 02_LOCATION_PHOTOS.deduped.f16.pkl
+# pkl = 02_LOCATION_PHOTOS.deduped.f16.pkl
+pkl = PhotoLibrary.854G.deduped.f16.pkl
 url = https://static.renyi.hu/ai-shared/daniel/sameenergy/
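For context, app.ini is a plain INI file with a single [DEFAULT] section, so it can be read with Python's stock configparser, which also skips the commented-out pkl line by default. The following is a minimal sketch of how an app might consume it; the actual loading code is not part of this commit.

    import configparser

    # Hypothetical config loading (an assumption, not code from this repo):
    # pull the pickle filename and the base URL out of app.ini's [DEFAULT].
    config = configparser.ConfigParser()
    config.read("app.ini")
    pkl = config["DEFAULT"]["pkl"]  # -> PhotoLibrary.854G.deduped.f16.pkl
    url = config["DEFAULT"]["url"]  # -> https://static.renyi.hu/ai-shared/daniel/sameenergy/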
create_embeddings.py
CHANGED
@@ -31,11 +31,27 @@ assert output_filename.endswith("pkl"), "first argument is the output pickle"
 assert sys.argv[2] in ("thumbs", "no-thumbs"), "second argument either thumbs or no-thumbs"
 do_thumbs = sys.argv[2] == "thumbs"
 
+
+def save(output_filename, embeddings, filenames):
+    embeddings = np.array(embeddings)
+    assert len(embeddings) == len(filenames)
+    print(f"processed {len(embeddings)} images")
+
+    data = {"embeddings": embeddings, "filenames": filenames}
+    if do_thumbs:
+        assert len(embeddings) == len(thumbs)
+        data["thumbs"] = thumbs
+
+    with open(output_filename, "wb") as f:
+        pickle.dump(data, f)
+
+
 embeddings = []
 filenames = []
 thumbs = []
 print("starting processing")
 batch = []
+batch_count = 0
 for filename in sys.stdin:
     filename = filename.rstrip()
     if filename.lower().endswith("jpg") or filename.lower().endswith("jpeg"):
@@ -43,10 +59,13 @@ for filename in sys.stdin:
         rgb = Image.open(filename).convert("RGB")
         img = preprocess(rgb)
         batch.append(img)
+        filenames.append(filename)
         if len(batch) >= batch_size:
             do_batch(batch, embeddings)
             batch = []
-
+            batch_count += 1
+            if batch_count % 200 == 0:
+                save(output_filename, embeddings, filenames)
         if do_thumbs:
             rgb.thumbnail((128, 128))
             thumb = np.array(rgb)
@@ -63,14 +82,4 @@ for filename in sys.stdin:
 if len(batch) > 0:
     do_batch(batch, embeddings)
 
-embeddings = np.array(embeddings)
-assert len(embeddings) == len(filenames)
-print(f"processed {len(embeddings)} images")
-
-data = {"embeddings": embeddings, "filenames": filenames}
-if do_thumbs:
-    assert len(embeddings) == len(thumbs)
-    data["thumbs"] = thumbs
-
-with open(output_filename, "wb") as f:
-    pickle.dump(data, f)
+save(output_filename, embeddings, filenames)
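The net effect of the refactor: the final pickling code moves into save(), which is now called every 200 batches as well as at the end, so a crash partway through a multi-day run loses at most 200 batches of work. Each intermediate save overwrites the same output pickle in place. Below is a minimal sketch (not part of the commit) of reading the result back, using the keys that save() writes.

    import pickle

    # Minimal sketch: load the pickle written by save() in create_embeddings.py.
    # "thumbs" is only present when the script was run with the "thumbs" argument.
    with open("PhotoLibrary.854G.deduped.pkl", "rb") as f:
        data = pickle.load(f)

    embeddings = data["embeddings"]  # np.array, one entry per processed image
    filenames = data["filenames"]    # parallel list of source paths
    assert len(embeddings) == len(filenames)
    print(f"{len(embeddings)} embeddings loaded")
    if "thumbs" in data:
        print(f"{len(data['thumbs'])} thumbnails (each at most 128x128)")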
create_embeddings.sh
CHANGED
@@ -1 +1 @@
-cat
+cat PhotoLibrary.854G.deduped_md5sums | cut -f3- -d' ' | python ~/experiments/kalman/se/create_embeddings.py PhotoLibrary.854G.deduped.pkl no-thumbs
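The cut -f3- -d' ' stage assumes md5sum's default line format, HASH followed by two spaces and the path: split on single spaces, the path starts at field 3, and -f3- keeps any further spaces inside the path itself. A Python equivalent under that same format assumption:

    import sys

    # Minimal sketch of `cut -f3- -d' '` over md5sum-style lines
    # ("HASH  PATH", two spaces): everything after the second space is the path.
    for line in sys.stdin:
        parts = line.rstrip("\n").split(" ", 2)
        if len(parts) == 3:
            print(parts[2])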
readme.sh
CHANGED
@@ -68,3 +68,30 @@ python convert.py 02_LOCATION_PHOTOS.pkl
 # -> creates float16 02_LOCATION_PHOTOS.f16.pkl
 mv md5sums 02_LOCATION_PHOTOS.f16.md5sums
 python dedupe.py 02_LOCATION_PHOTOS.f16.pkl 02_LOCATION_PHOTOS.f16.md5sums 02_LOCATION_PHOTOS.deduped.f16.pkl
+
+
+# started downloading PhotoLibrary, but it's super big, 6 days of downloading and counting.
+ssh buda
+cd /data/daniel/sameenergy
+nohup rsync -r hexagon.renyi.hu:./ai-shared/daniel/sameenergy/PhotoLibrary . &
+# 30MB/sec, that's some 10 hours? don't forget that the source is still increasing.
+
+
+nohup bash hashes.sh > md5.cout 2> md5.cerr &
+# -> creates PhotoLibrary.854G.md5sums , md5.cout and md5.cerr are just logs.
+cat PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" | wc
+
+cat PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" > PhotoLibrary.854G.deduped_md5sums
+
+
+cat 02_LOCATION_PHOTOS PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" > 02_and_PhotoLibrary.854G.deduped_md5sums
+
+wc -l 02_and_PhotoLibrary.854G.deduped_md5sums PhotoLibrary.854G.deduped_md5sums
+591500 02_and_PhotoLibrary.854G.deduped_md5sums
+514706 PhotoLibrary.854G.deduped_md5sums
+# -> is not worth the hassle merging them. let's just do PhotoLibrary.
+
+# rsync has finished, turns out i've collected PhotoLibrary.854G.raw_files right before that,
+# doing the complete would need a re-hash, is not worth the hassle either. staying with PhotoLibrary.854G.deduped_md5sums
+
+# TODO I don't think lftp has finished successfully, because the Tünde folder has never arrived.
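The awk one-liner in the notes above dedupes by md5: it remembers the first line seen for each hash (field 1), prefixes every later line with the same hash with "skipping", and the grep -v then drops the tagged duplicates. A minimal Python sketch of the same first-occurrence-wins logic:

    import sys

    # Minimal sketch of the awk/grep dedupe: keep only the first line per md5
    # hash (the first space-delimited field); later lines with the same hash
    # are byte-identical files and are dropped.
    seen = set()
    for line in sys.stdin:
        md5 = line.split(" ", 1)[0]
        if md5 not in seen:
            seen.add(md5)
            sys.stdout.write(line)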