# lots of images moved to directory sample_fbi_s1e1
# list them:
find sample_fbi_s1e1 | grep "jpg\|JPG\|jpeg$" > sample_fbi_s1e1.txt
# copy them to public:
scp -q -r -P 2820 sample_fbi_s1e1 hexagon.renyi.hu:./ai-shared/daniel/sameenergy/
# example URL:
# https://static.renyi.hu/ai-shared/daniel/sameenergy/sample_fbi_s1e1/x_BRIDGE_ADRIATIC/Dobogoko_Esztergom/Videk_ut_Dobogoko_Esztergom_014.jpg
# run CLIP:
time cat sample_fbi_s1e1.txt | python create_embeddings.py sample_fbi_s1e1.pkl no-thumbs
# -> sample_fbi_s1e1.pkl contains embeddings and filenames.
# some 12 images/sec on CPU.
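# quick sanity check of the result (only generic info is printed, since the exact
# structure of the pickle is whatever create_embeddings.py chose to dump):
python -c "import pickle; d = pickle.load(open('sample_fbi_s1e1.pkl', 'rb')); print(type(d), len(d))"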
# gradio app:
python app.py --pkl sample_fbi_s1e1.pkl --url https://static.renyi.hu/ai-shared/daniel/sameenergy/
# or
python app.py
# ...in which case it reads these settings from app.ini
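# app.ini is not reproduced here; a hypothetical example mirroring the two CLI flags
# (the actual key names are whatever app.py expects):
# pkl = sample_fbi_s1e1.pkl
# url = https://static.renyi.hu/ai-shared/daniel/sameenergy/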
python convert.py sample_fbi_s1e1.pkl
# -> creates sample_fbi_s1e1.f16.pkl, downcast from float64 to float16.
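# a minimal sketch of what that conversion amounts to, assuming the pickle is a
# (filenames, embeddings) pair with a float64 numpy matrix (the real convert.py may differ):
python - <<'EOF'
import pickle
import numpy as np

with open("sample_fbi_s1e1.pkl", "rb") as f:
    filenames, embeddings = pickle.load(f)
embeddings = np.asarray(embeddings).astype(np.float16)  # a quarter the size of float64
with open("sample_fbi_s1e1.f16.pkl", "wb") as f:
    pickle.dump((filenames, embeddings), f)
EOF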
ssh -p 2820 hexagon.renyi.hu
cd ai-shared/daniel/sameenergy
lftp -p 2167 -u d.varga gw.pioneer.hu
# manually provide password
cd store/05_Photos
# promising directories:
ls 02_LOCATION\ PHOTOS 05_TO_LOCATION_PHOTOS PhotoLibrary Tünde
mirror 02_LOCATION\ PHOTOS
ctrl-z
# -> puts the mirroring job into the background.
ctrl-d
# -> exits lftp without terminating the background job, effectively nohup-ing it.
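# one way to watch the detached mirror's progress from another shell on hexagon
# (assuming the mirror lands in ./02_LOCATION PHOTOS under the cwd above):
watch -n 60 'du -sh "02_LOCATION PHOTOS"'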
# scp'd files to buda
cd /data/daniel/sameenergy/
# how many bytes, as a check?
find 02_LOCATION_PHOTOS -type f -exec stat --format="%s" {} \; | awk '{total += $1} END {print total}'
# -> 141,133,402,112 bytes, that's 141GB, in 197108 files (not counting directories).
# on the Pioneer server this was 141,131,778,304 bytes in 196916 files and 6446 directories; good enough.
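# the file and directory counts can be double-checked the same way:
find 02_LOCATION_PHOTOS -type f | wc -l
find 02_LOCATION_PHOTOS -type d | wc -l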
find 02_LOCATION_PHOTOS -type f > raw_files
cat raw_files | grep -i "jpg\|jpeg$" > jpg_files
# TODO
# chmod files on ai-shared
nohup bash create_embeddings.sh &
# ...but it's really just this:
# cat jpg_files | python ~/experiments/kalman/se/create_embeddings.py 02_LOCATION_PHOTOS.pkl no-thumbs
# -> after roughly 8 hours it processes about 200k images, resulting in
# 02_LOCATION_PHOTOS.pkl
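# nothing was redirected on the nohup above, so its output accumulates in nohup.out;
# progress can be followed with:
tail -f nohup.out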
# hashes for deduplication:
bash hashes.sh
# takes jpg_files and outputs md5sums
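# hashes.sh isn't reproduced here; a minimal sketch of what it presumably boils down to
# (reading the list line by line so odd characters in paths survive):
while IFS= read -r f ; do md5sum "$f" ; done < jpg_files > md5sums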
# ad hoc sample from duplicates:
cat md5sums | awk 'BEGIN{FS=" "} { if ($1 in m) { print $1 "\t" $2 "\t" m[$1] } ; m[$1] = $2 }' | awk '(NR%4000==0)' | cut -f2-
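# -> the first awk emits a line for each duplicate md5 (hash, current path, the path it
#    collided with), the second keeps every 4000th such line, and cut drops the hash.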
python convert.py 02_LOCATION_PHOTOS.pkl
# -> creates float16 02_LOCATION_PHOTOS.f16.pkl
mv md5sums 02_LOCATION_PHOTOS.f16.md5sums
python dedupe.py 02_LOCATION_PHOTOS.f16.pkl 02_LOCATION_PHOTOS.f16.md5sums 02_LOCATION_PHOTOS.deduped.f16.pkl
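# dedupe.py isn't reproduced here; a minimal sketch of the idea, assuming the pickle is
# a (filenames, embeddings) pair like in the convert.py sketch above (the real script may differ):
python - <<'EOF'
import pickle
import numpy as np

with open("02_LOCATION_PHOTOS.f16.pkl", "rb") as f:
    filenames, embeddings = pickle.load(f)

# keep only the first path seen for each md5 hash
keep, seen = set(), set()
with open("02_LOCATION_PHOTOS.f16.md5sums") as f:
    for line in f:
        digest, path = line.rstrip("\n").split(maxsplit=1)
        if digest not in seen:
            seen.add(digest)
            keep.add(path)

mask = np.array([path in keep for path in filenames])
filenames = [path for path, m in zip(filenames, mask) if m]
embeddings = np.asarray(embeddings)[mask]

with open("02_LOCATION_PHOTOS.deduped.f16.pkl", "wb") as f:
    pickle.dump((filenames, embeddings), f)
EOF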
# started downloading PhotoLibrary, but it's super big, 6 days of downloading and counting.
ssh buda
cd /data/daniel/sameenergy
nohup rsync -r hexagon.renyi.hu:./ai-shared/daniel/sameenergy/PhotoLibrary . &
# 30MB/sec, so roughly 10 hours? and don't forget that the source is still growing.
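# crude progress check: the later filenames say the source is about 854G, so just watch
# the local copy grow towards that:
du -sh PhotoLibrary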
nohup bash hashes.sh > md5.cout 2> md5.cerr &
# -> creates PhotoLibrary.854G.md5sums; md5.cout and md5.cerr are just logs.
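# the following one-liners dedupe by md5: keep only the first path seen for each hash
# (awk's m array remembers the hashes already passed through).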
cat PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" | wc
cat PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" > PhotoLibrary.854G.deduped_md5sums
cat 02_LOCATION_PHOTOS.f16.md5sums PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" > 02_and_PhotoLibrary.854G.deduped_md5sums
wc -l 02_and_PhotoLibrary.854G.deduped_md5sums PhotoLibrary.854G.deduped_md5sums
#  591500 02_and_PhotoLibrary.854G.deduped_md5sums
#  514706 PhotoLibrary.854G.deduped_md5sums
# -> merging them is not worth the hassle. let's just do PhotoLibrary.
# rsync has finished. turns out I collected PhotoLibrary.854G.raw_files right before that,
# and doing the complete set would need a re-hash, which is not worth the hassle either. staying with PhotoLibrary.854G.deduped_md5sums.
# TODO I don't think lftp has finished successfully, because the Tünde folder has never arrived.
#####
# thumbnailing
# on hexagon
cd ~/ai-shared/daniel/sameenergy
nohup cp -r 02_LOCATION_PHOTOS 02_LOCATION_PHOTOS.thumbs &
nohup cp -r PhotoLibrary PhotoLibrary.thumbs &
# -> this is slooooow, a day or so.
# the following code, located at hexagon:~/ai-shared/daniel/sameenergy/downscale.sh,
# downscales in place so that each image fits into 1024x1024:
find "$root" -type f | grep -i "jpeg\|jpg$" | while IFS= read -r f ; do echo "$f" ; convert "$f" -resize "1024x1024>" "$f" ; done
# it was run like this, setting root=02_LOCATION_PHOTOS.thumbs
nohup bash downscale.sh > 02_LOCATION_PHOTOS.downscale.cout 2> 02_LOCATION_PHOTOS.downscale.cerr &
# -> took a night or so.
nohup bash downscale.sh > PhotoLibrary.downscale.cout 2> PhotoLibrary.downscale.cerr &
# -> took 2 days or so.
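# the downscale loop above is sequential; if it ever needs rerunning, a parallel variant
# along these lines (8 workers, null-delimited so spaces in paths survive) should cut the
# wall-clock time considerably. a sketch, not what was actually run:
find "$root" -type f \( -iname "*.jpg" -o -iname "*.jpeg" \) -print0 | xargs -0 -P 8 -I{} convert {} -resize "1024x1024>" {}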
# added code to app.py that patches the filenames in the pickle, changing PhotoLibrary to PhotoLibrary.thumbs.
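# for reference, a minimal sketch of that patch (hypothetical variable name; the real
# code in app.py may differ):
#     filenames = [f.replace("PhotoLibrary/", "PhotoLibrary.thumbs/", 1) for f in filenames]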