# lots of images moved to directory sample_fbi_s1e1
# list them:
find sample_fbi_s1e1 | grep "jpg\|JPG\|jpeg$" > sample_fbi_s1e1.txt
# copy them to public:
scp -q -r -P 2820 sample_fbi_s1e1 hexagon.renyi.hu:./ai-shared/daniel/sameenergy/
# example URL:
# https://static.renyi.hu/ai-shared/daniel/sameenergy/sample_fbi_s1e1/x_BRIDGE_ADRIATIC/Dobogoko_Esztergom/Videk_ut_Dobogoko_Esztergom_014.jpg
# run CLIP:
time cat sample_fbi_s1e1.txt | python create_embeddings.py sample_fbi_s1e1.pkl no-thumbs
# -> sample_fbi_s1e1.pkl contains embeddings and filenames.
# some 12 images/sec on CPU.
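# create_embeddings.py itself is not included in these notes. A minimal sketch of what it
# presumably does, assuming OpenAI's CLIP package and a pickle holding a dict with the filenames
# and an embedding matrix (the exact pickle layout and the no-thumbs flag handling are guesses):
import pickle
import sys

import clip  # pip install git+https://github.com/openai/CLIP.git
import numpy as np
import torch
from PIL import Image

def main(out_pkl):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    filenames, embeddings = [], []
    for line in sys.stdin:                      # image paths, one per line
        path = line.strip()
        if not path:
            continue
        try:
            image = preprocess(Image.open(path)).unsqueeze(0).to(device)
        except OSError:
            continue                            # skip unreadable/truncated images
        with torch.no_grad():
            embeddings.append(model.encode_image(image).cpu().numpy()[0])
        filenames.append(path)
    with open(out_pkl, "wb") as f:
        pickle.dump({"filenames": filenames, "embeddings": np.stack(embeddings)}, f)

if __name__ == "__main__":
    main(sys.argv[1])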
# gradio app:
python app.py --pkl sample_fbi_s1e1.pkl --url https://static.renyi.hu/ai-shared/daniel/sameenergy/
# or
python app.py
# ...in which case it takes these parameters from app.ini
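# How app.py falls back to app.ini is not spelled out above; one plausible way to do it with
# argparse plus configparser (the [app] section name is an assumption, the option names match
# the --pkl and --url flags):
import argparse
import configparser

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--pkl")
    parser.add_argument("--url")
    args = parser.parse_args()
    config = configparser.ConfigParser()
    config.read("app.ini")
    # anything not given on the command line comes from app.ini
    args.pkl = args.pkl or config.get("app", "pkl")
    args.url = args.url or config.get("app", "url")
    return args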
python convert.py sample_fbi_s1e1.pkl
# -> creates sample_fbi_s1e1.f16.pkl, downcast from float64 to float16.
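# convert.py is likewise not included; the conversion is presumably just load, astype, dump
# (float64 -> float16 shrinks the embedding matrix to a quarter of its size). A sketch,
# assuming the pickle layout above:
import pickle
import sys

import numpy as np

src = sys.argv[1]                         # e.g. sample_fbi_s1e1.pkl
dst = src.replace(".pkl", ".f16.pkl")
with open(src, "rb") as f:
    data = pickle.load(f)
data["embeddings"] = np.asarray(data["embeddings"]).astype(np.float16)
with open(dst, "wb") as f:
    pickle.dump(data, f)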
ssh -p 2820 hexagon.renyi.hu
cd ai-shared/daniel/sameenergy
lftp -p 2167 -u d.varga gw.pioneer.hu
# provide the password manually
cd store/05_Photos
# promising directories:
ls 02_LOCATION\ PHOTOS 05_TO_LOCATION_PHOTOS PhotoLibrary Tünde
mirror 02_LOCATION\ PHOTOS
ctrl-z
# -> puts the mirroring into the background.
ctrl-d
# -> exits lftp without terminating the background job, effectively nohup-ing it.
# scp'd the files over to buda
cd /data/daniel/sameenergy/
# how many bytes, as a check?
find 02_LOCATION_PHOTOS -type f -exec stat --format="%s" {} \; | awk '{total += $1} END {print total}'
# -> 141,133,402,112 bytes, that's 141 GB, in 197,108 files, not counting directories.
# on the Pioneer server this was 141,131,778,304 bytes in 196,916 files and 6,446 directories; close enough.
find 02_LOCATION_PHOTOS -type f > raw_files
cat raw_files | grep -i "jpg\|jpeg$" > jpg_files
# TODO: chmod files on ai-shared
nohup bash create_embeddings.sh &
# ...but it's really just this:
# cat jpg_files | python ~/experiments/kalman/se/create_embeddings.py 02_LOCATION_PHOTOS.pkl no-thumbs
# -> after some 8 hours it had processed ca. 200k images, resulting in
# 02_LOCATION_PHOTOS.pkl
# hashes for deduplication:
bash hashes.sh
# reads jpg_files and outputs md5sums
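# hashes.sh is not included here either; judging by the awk commands below, its output is in
# the usual md5sum format (hash, whitespace, path). The same thing sketched in Python, reading
# paths on stdin (hashes_sketch.py is a hypothetical name, not the actual script):
import hashlib
import sys

for line in sys.stdin:
    path = line.rstrip("\n")
    if not path:
        continue
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MB chunks
            h.update(chunk)
    print(f"{h.hexdigest()}  {path}")
# usage would be: python hashes_sketch.py < jpg_files > md5sums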
# ad hoc sample from duplicates:
cat md5sums | awk 'BEGIN{FS=" "} { if ($1 in m) { print $1 "\t" $2 "\t" m[$1] } ; m[$1] = $2 }' | awk '(NR%4000==0)' | cut -f2-
python convert.py 02_LOCATION_PHOTOS.pkl
# -> creates float16 02_LOCATION_PHOTOS.f16.pkl
mv md5sums 02_LOCATION_PHOTOS.f16.md5sums
python dedupe.py 02_LOCATION_PHOTOS.f16.pkl 02_LOCATION_PHOTOS.f16.md5sums 02_LOCATION_PHOTOS.deduped.f16.pkl
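# dedupe.py is not shown; presumably it keeps the first file seen for each md5 hash (the same
# "first occurrence wins" rule as the awk commands below) and drops the other rows from the
# pickle. A sketch under that assumption, with the pickle layout guessed as before:
import pickle
import sys

import numpy as np

pkl_in, md5sums_path, pkl_out = sys.argv[1:4]

keep, seen = set(), set()
with open(md5sums_path) as f:
    for line in f:
        parts = line.rstrip("\n").split(maxsplit=1)   # md5sum format: hash, whitespace, path
        if len(parts) == 2 and parts[0] not in seen:
            seen.add(parts[0])
            keep.add(parts[1])

with open(pkl_in, "rb") as f:
    data = pickle.load(f)
mask = np.array([fn in keep for fn in data["filenames"]])
data["filenames"] = [fn for fn, m in zip(data["filenames"], mask) if m]
data["embeddings"] = np.asarray(data["embeddings"])[mask]
with open(pkl_out, "wb") as f:
    pickle.dump(data, f)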
# started downloading PhotoLibrary, but it's super big: 6 days of downloading and counting.
ssh buda
cd /data/daniel/sameenergy
nohup rsync -r hexagon.renyi.hu:./ai-shared/daniel/sameenergy/PhotoLibrary . &
# 30 MB/sec, that's some 10 hours? don't forget that the source is still growing.
nohup bash hashes.sh > md5.cout 2> md5.cerr &
# -> creates PhotoLibrary.854G.md5sums ; md5.cout and md5.cerr are just logs.
cat PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" | wc
cat PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" > PhotoLibrary.854G.deduped_md5sums
cat 02_LOCATION_PHOTOS.f16.md5sums PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" > 02_and_PhotoLibrary.854G.deduped_md5sums
wc -l 02_and_PhotoLibrary.854G.deduped_md5sums PhotoLibrary.854G.deduped_md5sums
591500 02_and_PhotoLibrary.854G.deduped_md5sums
514706 PhotoLibrary.854G.deduped_md5sums
# -> not worth the hassle of merging them. let's just do PhotoLibrary.
# rsync has finished; turns out I'd collected PhotoLibrary.854G.raw_files right before it did,
# so covering the complete set would need a re-hash, which isn't worth the hassle either. staying with PhotoLibrary.854G.deduped_md5sums
# TODO: I don't think lftp finished successfully, because the Tünde folder never arrived.
#####
# thumbnailing
# on hexagon
cd ~/ai-shared/daniel/sameenergy
nohup cp -r 02_LOCATION_PHOTOS 02_LOCATION_PHOTOS.thumbs &
nohup cp -r PhotoLibrary PhotoLibrary.thumbs &
# -> this is slooooow, a day or so.
# the following code, located at hexagon:~/ai-shared/daniel/sameenergy/downscale.sh,
# downscales each image so that it fits into 1024x1024 (the ">" modifier means already-smaller images are left untouched):
find $root -type f | grep -i "jpeg\|jpg$" | while read f ; do echo "$f" ; convert "$f" -resize "1024x1024>" "$f" ; done
# it was run like this, setting root=02_LOCATION_PHOTOS.thumbs (and then root=PhotoLibrary.thumbs for the second run):
nohup bash downscale.sh > 02_LOCATION_PHOTOS.downscale.cout 2> 02_LOCATION_PHOTOS.downscale.cerr &
# -> took a night or so.
nohup bash downscale.sh > PhotoLibrary.downscale.cout 2> PhotoLibrary.downscale.cerr &
# -> took 2 days or so.
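# For reference, the same "fit into 1024x1024, never enlarge" behaviour as ImageMagick's
# -resize "1024x1024>", sketched with Pillow (not what was actually run):
import sys

from PIL import Image

for path in sys.argv[1:]:
    with Image.open(path) as im:
        im.thumbnail((1024, 1024))   # shrinks in place, preserves aspect ratio, never enlarges
        im.save(path)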
# added code to app.py that patches the filenames in the pickle, changing PhotoLibrary to PhotoLibrary.thumbs
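# The patch itself is presumably a one-liner after the pickle is loaded in app.py
# (variable names made up here):
data["filenames"] = [fn.replace("PhotoLibrary/", "PhotoLibrary.thumbs/", 1)
                     for fn in data["filenames"]]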