# lots of images moved to directory sample_fbi_s1e1
# list them:
find sample_fbi_s1e1 | grep "jpg\|JPG\|jpeg$" > sample_fbi_s1e1.txt
# copy them to public:
scp -q -r -P 2820 sample_fbi_s1e1 hexagon.renyi.hu:./ai-shared/daniel/sameenergy/
# example URL:
# https://static.renyi.hu/ai-shared/daniel/sameenergy/sample_fbi_s1e1/x_BRIDGE_ADRIATIC/Dobogoko_Esztergom/Videk_ut_Dobogoko_Esztergom_014.jpg
# run CLIP:
time cat sample_fbi_s1e1.txt | python create_embeddings.py sample_fbi_s1e1.pkl no-thumbs
# -> sample_fbi_s1e1.pkl contains embeddings and filenames.
# some 12 images/sec on CPU.
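# quick sanity check of the result (only generic info is printed, since the exact
# structure of the pickle is whatever create_embeddings.py chose to dump):
python -c "import pickle; d = pickle.load(open('sample_fbi_s1e1.pkl', 'rb')); print(type(d), len(d))"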
# gradio app:
python app.py --pkl sample_fbi_s1e1.pkl --url https://static.renyi.hu/ai-shared/daniel/sameenergy/
# or
python app.py
# ...in which case it reads these settings from app.ini
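# app.ini is not reproduced here; a hypothetical example mirroring the two CLI flags
# (the actual key names are whatever app.py expects):
# pkl = sample_fbi_s1e1.pkl
# url = https://static.renyi.hu/ai-shared/daniel/sameenergy/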
python convert.py sample_fbi_s1e1.pkl
# -> creates sample_fbi_s1e1.f16.pkl, downcast from float64 to float16.
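# a minimal sketch of what that conversion amounts to, assuming the pickle is a
# (filenames, embeddings) pair with a float64 numpy matrix (the real convert.py may differ):
python - <<'EOF'
import pickle
import numpy as np

with open("sample_fbi_s1e1.pkl", "rb") as f:
    filenames, embeddings = pickle.load(f)
embeddings = np.asarray(embeddings).astype(np.float16)  # a quarter the size of float64
with open("sample_fbi_s1e1.f16.pkl", "wb") as f:
    pickle.dump((filenames, embeddings), f)
EOF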
ssh -p 2820 hexagon.renyi.hu
cd ai-shared/daniel/sameenergy
lftp -p 2167 -u d.varga gw.pioneer.hu
# manually provide password
cd store/05_Photos
# promising directories:
ls 02_LOCATION\ PHOTOS 05_TO_LOCATION_PHOTOS PhotoLibrary Tünde
mirror 02_LOCATION\ PHOTOS
ctrl-z
# -> puts the mirroring job into the background.
ctrl-d
# -> exits lftp without terminating the background job, effectively nohup-ing it.
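# one way to watch the detached mirror's progress from another shell on hexagon
# (assuming the mirror lands in ./02_LOCATION PHOTOS under the cwd above):
watch -n 60 'du -sh "02_LOCATION PHOTOS"'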
# scp'd files to buda
cd /data/daniel/sameenergy/
# how many bytes, as a check?
find 02_LOCATION_PHOTOS -type f -exec stat --format="%s" {} \; | awk '{total += $1} END {print total}'
# -> 141,133,402,112 bytes, that's 141GB, in 197108 files (not counting directories).
# on the Pioneer server this was 141,131,778,304 bytes in 196916 files and 6446 directories; good enough.
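# the file and directory counts can be double-checked the same way:
find 02_LOCATION_PHOTOS -type f | wc -l
find 02_LOCATION_PHOTOS -type d | wc -l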
find 02_LOCATION_PHOTOS -type f > raw_files
cat raw_files | grep -i "jpg\|jpeg$" > jpg_files
# TODO
# chmod files on ai-shared
nohup bash create_embeddings.sh &
# ...but it's really just this:
# cat jpg_files | python ~/experiments/kalman/se/create_embeddings.py 02_LOCATION_PHOTOS.pkl no-thumbs
# -> after roughly 8 hours it processes about 200k images, resulting in
# 02_LOCATION_PHOTOS.pkl
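# nothing was redirected on the nohup above, so its output accumulates in nohup.out;
# progress can be followed with:
tail -f nohup.out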
# hashes for deduplication:
bash hashes.sh
# takes jpg_files and outputs md5sums
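# hashes.sh isn't reproduced here; a minimal sketch of what it presumably boils down to
# (reading the list line by line so odd characters in paths survive):
while IFS= read -r f ; do md5sum "$f" ; done < jpg_files > md5sums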
# ad hoc sample from duplicates:
cat md5sums | awk 'BEGIN{FS=" "} { if ($1 in m) { print $1 "\t" $2 "\t" m[$1] } ; m[$1] = $2 }' | awk '(NR%4000==0)' | cut -f2-
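# -> the first awk emits a line for each duplicate md5 (hash, current path, the path it
#    collided with), the second keeps every 4000th such line, and cut drops the hash.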
python convert.py 02_LOCATION_PHOTOS.pkl
# -> creates float16 02_LOCATION_PHOTOS.f16.pkl
mv md5sums 02_LOCATION_PHOTOS.f16.md5sums
python dedupe.py 02_LOCATION_PHOTOS.f16.pkl 02_LOCATION_PHOTOS.f16.md5sums 02_LOCATION_PHOTOS.deduped.f16.pkl
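# dedupe.py isn't reproduced here; a minimal sketch of the idea, assuming the pickle is
# a (filenames, embeddings) pair like in the convert.py sketch above (the real script may differ):
python - <<'EOF'
import pickle
import numpy as np

with open("02_LOCATION_PHOTOS.f16.pkl", "rb") as f:
    filenames, embeddings = pickle.load(f)

# keep only the first path seen for each md5 hash
keep, seen = set(), set()
with open("02_LOCATION_PHOTOS.f16.md5sums") as f:
    for line in f:
        digest, path = line.rstrip("\n").split(maxsplit=1)
        if digest not in seen:
            seen.add(digest)
            keep.add(path)

mask = np.array([path in keep for path in filenames])
filenames = [path for path, m in zip(filenames, mask) if m]
embeddings = np.asarray(embeddings)[mask]

with open("02_LOCATION_PHOTOS.deduped.f16.pkl", "wb") as f:
    pickle.dump((filenames, embeddings), f)
EOF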
# started downloading PhotoLibrary, but it's super big, 6 days of downloading and counting.
ssh buda
cd /data/daniel/sameenergy
nohup rsync -r hexagon.renyi.hu:./ai-shared/daniel/sameenergy/PhotoLibrary . &
# 30MB/sec, so roughly 10 hours? and don't forget that the source is still growing.
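# crude progress check: the later filenames say the source is about 854G, so just watch
# the local copy grow towards that:
du -sh PhotoLibrary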
nohup bash hashes.sh > md5.cout 2> md5.cerr &
# -> creates PhotoLibrary.854G.md5sums; md5.cout and md5.cerr are just logs.
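# the following one-liners dedupe by md5: keep only the first path seen for each hash
# (awk's m array remembers the hashes already passed through).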
cat PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" | wc
cat PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" > PhotoLibrary.854G.deduped_md5sums
cat 02_LOCATION_PHOTOS.f16.md5sums PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" > 02_and_PhotoLibrary.854G.deduped_md5sums
wc -l 02_and_PhotoLibrary.854G.deduped_md5sums PhotoLibrary.854G.deduped_md5sums
#  591500 02_and_PhotoLibrary.854G.deduped_md5sums
#  514706 PhotoLibrary.854G.deduped_md5sums
# -> merging them is not worth the hassle. let's just do PhotoLibrary.
# rsync has finished. turns out I collected PhotoLibrary.854G.raw_files right before that,
# and doing the complete set would need a re-hash, which is not worth the hassle either. staying with PhotoLibrary.854G.deduped_md5sums.
# TODO I don't think lftp has finished successfully, because the Tünde folder has never arrived.
#####
# thumbnailing
# on hexagon
cd ~/ai-shared/daniel/sameenergy
nohup cp -r 02_LOCATION_PHOTOS 02_LOCATION_PHOTOS.thumbs &
nohup cp -r PhotoLibrary PhotoLibrary.thumbs &
# -> this is slooooow, a day or so.
# the following code, located at hexagon:~/ai-shared/daniel/sameenergy/downscale.sh,
# downscales in place so that each image fits into 1024x1024:
find "$root" -type f | grep -i "jpeg\|jpg$" | while IFS= read -r f ; do echo "$f" ; convert "$f" -resize "1024x1024>" "$f" ; done
# it was run like this, setting root=02_LOCATION_PHOTOS.thumbs
nohup bash downscale.sh > 02_LOCATION_PHOTOS.downscale.cout 2> 02_LOCATION_PHOTOS.downscale.cerr &
# -> took a night or so.
nohup bash downscale.sh > PhotoLibrary.downscale.cout 2> PhotoLibrary.downscale.cerr &
# -> took 2 days or so.
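# the downscale loop above is sequential; if it ever needs rerunning, a parallel variant
# along these lines (8 workers, null-delimited so spaces in paths survive) should cut the
# wall-clock time considerably. a sketch, not what was actually run:
find "$root" -type f \( -iname "*.jpg" -o -iname "*.jpeg" \) -print0 | xargs -0 -P 8 -I{} convert {} -resize "1024x1024>" {}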
# added code to app.py that patches the filenames in the pickle, changing PhotoLibrary to PhotoLibrary.thumbs.
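# for reference, a minimal sketch of that patch (hypothetical variable name; the real
# code in app.py may differ):
#     filenames = [f.replace("PhotoLibrary/", "PhotoLibrary.thumbs/", 1) for f in filenames]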