MyVIP/Bash script
< MyVIP
Jump to navigation
Jump to search
Revision as of 11:49, 3 October 2015 by Bzc6p (talk | contribs) (Created page with "MyVIP archiving bash script, written by user:bzc6p. Needs to be rewritten to conform ArchiveTeam framework and standards. <pre> #!/bin/bash # Discovers and downloads user...")
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
MyVIP archiving bash script, written by user:bzc6p. Needs to be rewritten to conform to the ArchiveTeam framework and standards.
#!/bin/bash
# Discovers and downloads user content belonging to given user ID.
# Accepts one or two paramters: a single id or two ids, in the latter case does the range.
# Creates a WARC file with the profile content and a csv file with one line
# containing some identifying information about the user.
# Avatar pictures' links are collected for future downloading.

# Clean up and prepare to bail out after a bad wpull run.
# If wpull is redirected, content is wrong and therefore we shouldn't go on.
#   $1 - user id being archived
#   $2 - WARC base name (myvip_com_user_<id>)
abort_wpull ()
{
  echo "> Wpull grabbed wrong pages last time, you probably have lost authentication or something other weird happened. Check the logs before going on."
  echo "> Aborted."
  rm temp1 temp2 temp3 temp4 temp41 temp42 temp5 acq_list list db myvip_script_lock 2>/dev/null
  mkdir ERROR 2>/dev/null
  # Keep the partial output around for post-mortem inspection.
  mv "$2.warc.gz" ERROR
  mv "$2.csv" ERROR
  mv log.txt "ERROR/log_$1.txt"
  rm avatars/*av_"$1"
  # NOTE(review): was "PAGE_PREFIX" here, but the variable actually used by the
  # script is PAGER_PREFIX -- fixed so it does not leak between runs.
  unset MYVIP_NAME MYVIP_NICKNAME MYVIP_BIRTHDATE MYVIP_PERM_ADDRESS MYVIP_TEMP_ADDRESS MYVIP_URL MAXPAGE PAGER_PREFIX NUMALBUMS ALBUMID NUMIMAGES MYVIP_TEMP NEWPAGERID OLDPAGERID NUMCLUBS
  unset MYVIP_A MYVIP_B WPULL_OPTS WGET_OPTS
}

# Robust single-page download.
# In case wget has a DNS error (doesn't retry) or we've lost authentication,
# sleep a minute and try again until the page comes back clean.
#   $1 - URL to fetch
#   $2 - output file name
fetch ()
{
  while true
  do
    # WGET_OPTS is intentionally unquoted: it must word-split into options.
    wget $WGET_OPTS -O "$2" "$1" || { echo "> Probably an error in the connection. Sleeping 1 minute..."; sleep 60; continue; }
    # The login button in the response means our session cookie is no longer valid.
    if grep -q "<span class=\"btn-text\">Bejelentkezés</span>" "$2"; then
      echo "> You have lost your authentication! Log in and export your cookies file again!"
      echo "> Sleeping 1 minute..."
      sleep 60
      echo "> Retrying..."
      continue
    fi
    break
  done
}

echo "*** myVIP user backup script ***"

# Validate parameters: $1 must be a numeric id; optional $2 a numeric id >= $1.
[[ $1 =~ ^[0-9]+$ ]] || { echo "> First parameter wrong!"; echo "> Aborted."; exit 1; }
[[ -z $2 ]] || [[ $2 =~ ^[0-9]+$ ]] || { echo "> Second parameter wrong!"; echo "> Aborted."; exit 1; }
[[ -z $2 ]] || [[ $1 -le $2 ]] || { echo "> Parameters wrong!"; echo "> Aborted."; exit 1; }

echo "> Looking for wpull..."
# Make sure wpull is installed and at least version 1.2.
wpull --version > wpull_ver 2>/dev/null || { echo "> You don't have wpull installed! wpull is necessary for the script to run!"; echo "> Aborted"; rm wpull_ver; exit 1; }
[ "$(cut -d"." -f 1 wpull_ver)" -lt 1 ] && { echo "> Your wpull version is too old ($(cat wpull_ver)). The script needs at least wpull version 1.2 to run."; echo "> Aborted."; rm wpull_ver; exit 1; }
[ "$(cut -d"." -f 1 wpull_ver)" -eq 1 ] && [ "$(cut -d"." -f 2 wpull_ver)" -lt 2 ] && { echo "> Your wpull version is too old ($(cat wpull_ver)). The script needs at least wpull version 1.2 to run."; echo "> Aborted."; rm wpull_ver; exit 1; }
rm wpull_ver

# Verify the cookies still authenticate us ("Adatlap" only appears when logged in).
echo "> Checking authentication..."
if ! wget --load-cookies cookies.txt -q -O - http://myvip.com/profile.php | grep -q "Adatlap"; then
  echo "> Authentication failed. Check your cookies file or your internet connection."
  echo "> Aborted."
  exit 1
fi

# Simple lock file: multiple instances would trample each other's temp files.
if [ -e myvip_script_lock ]; then
  echo "> Another myVIP backup script seems to be running! Multiple instances of the script MUST NOT be run at the same time!"
  echo "> It is possible though that the last run interrupted. If you are sure no other myVIP backup script is running, issue 'rm myvip_script_lock' and retry."
  echo "> Aborted."
  exit 1
fi
touch myvip_script_lock
mkdir avatars warcs logs index 2>/dev/null

MYVIP_A=$1
if [[ -z $2 ]]; then
  MYVIP_B=$1
  echo "> Backing up myVIP user profile $MYVIP_A"
else
  MYVIP_B=$2
  echo "> Backing up myVIP user profiles ${MYVIP_A}–${MYVIP_B}"
fi

# Options for wpull and wget. Kept in plain strings and expanded unquoted on
# purpose so they word-split into individual options (none contains spaces).
WPULL_OPTS="--exclude-domains static.myvip.com,avatar.myvip.com --reject-regex infobar_frame|banner_bottombanner_frame -a log.txt --retry-connrefused --retry-dns-error --tries inf --waitretry 10 --timeout 30 --no-robots --progress none --load-cookies cookies.txt -p -H -Dmyvip.com --no-warc-keep-log --delete-after --database db --warc-append" # options for wpull
WGET_OPTS="-q -a log.txt --retry-connrefused -e robots=off --tries 0 --waitretry 10 --timeout 30 --load-cookies cookies.txt"

# Abort the whole run if the last wpull got redirected to the login (index.php)
# or home (homeent.php) page -- that means the grabbed content is wrong.
#   $1 - user id, $2 - WARC base name
check_wpull_log ()
{
  if grep -q "index\.php" log.txt || grep -q "homeent\.php" log.txt; then
    abort_wpull "$1" "$2"
    exit 1
  fi
}

for (( n = MYVIP_A; n <= MYVIP_B; n++ ))
do
  WARC_NAME=myvip_com_user_$n
  rm list acq_list 2>/dev/null
  echo "-------------------------------------------------------------------------------"
  unset MYVIP_NAME MYVIP_NICKNAME MYVIP_BIRTHDATE MYVIP_PERM_ADDRESS MYVIP_TEMP_ADDRESS MYVIP_URL MAXPAGE PAGER_PREFIX NUMALBUMS ALBUMID NUMIMAGES MYVIP_TEMP NEWPAGERID OLDPAGERID NUMCLUBS
  echo "> Fetching user page $n..."
  fetch "http://myvip.com/profile.php?uid=$n" "temp1" # initial grab of user page

  if grep -q "Törölt, vagy nem létező felhasználó!" temp1; then # "deleted or nonexistent user" -> profile doesn't exist
    echo "> User profile doesn't exist, saving empty page..."
    echo ";;;;;http://myvip.com/profile.php?uid=$n" > "$WARC_NAME.csv"
    wpull $WPULL_OPTS --warc-file "$WARC_NAME" "http://myvip.com/profile.php?uid=$n" # actual content grab
    check_wpull_log "$n" "$WARC_NAME"
    echo "> Empty profile page $n archived."
  else # user page exists
    echo "http://myvip.com/profile.php?uid=$n" >> list # it will be grabbed

    # Parse the profile page for identification information. Those of everyone
    # will be put in an index so that if one looks for their profile, they can
    # easily find them. Multiple fields are necessary because several people
    # may have the same name, and not everyone fills in all the fields.
    # Semicolon is the field separator, so semicolons in data become commas.
    MYVIP_NAME=$(grep -o "<span style='width:[0-9]*px;' class='pairs-key'>név:</span><span style='margin-left:[0-9]*px;' class='pairs-value'>[^<]*</span>" temp1 | sed "s/<span style='width:[0-9]*px;' class='pairs-key'>név:<\/span><span style='margin-left:[0-9]*px;' class='pairs-value'>//g" | sed "s/<\/span>//g" | sed "s/;/,/g")
    MYVIP_NICKNAME=$(grep -o "<span style='width:[0-9]*px;' class='pairs-key'>becenév:</span><span style='margin-left:[0-9]*px;' class='pairs-value'>[^<]*</span>" temp1 | sed "s/<span style='width:[0-9]*px;' class='pairs-key'>becenév:<\/span><span style='margin-left:[0-9]*px;' class='pairs-value'>//g" | sed "s/<\/span>//g" | sed "s/;/,/g")
    MYVIP_BIRTHDATE=$(grep -o "<span style='width:[0-9]*px;' class='pairs-key'>születési idő:</span><span style='margin-left:[0-9]*px;' class='pairs-value'>[^<]*</span>" temp1 | sed "s/<span style='width:[0-9]*px;' class='pairs-key'>születési idő:<\/span><span style='margin-left:[0-9]*px;' class='pairs-value'>//g" | sed "s/<\/span>//g" | cut -d" " -f1-3)
    # NOTE(review): the "&gt;" entity decodes below were reconstructed from a
    # wiki dump that had decoded them in place -- verify against a live page.
    MYVIP_PERM_ADDRESS=$(grep -o "<span style='width:[0-9]*px;' class='pairs-key'>lakhely:</span><span style='margin-left:[0-9]*px;' class='pairs-value'>[^<]*</span>" temp1 | sed "s/<span style='width:[0-9]*px;' class='pairs-key'>lakhely:<\/span><span style='margin-left:[0-9]*px;' class='pairs-value'>//g" | sed "s/<\/span>//g" | sed "s/&gt;/>/g" | cut -d">" -f 3 | cut -d" " -f 2- | sed "s/;/,/g")
    MYVIP_TEMP_ADDRESS=$(grep -o "<span style='width:[0-9]*px;' class='pairs-key'>tartózkodási hely:</span><span style='margin-left:[0-9]*px;' class='pairs-value'>[^<]*</span>" temp1 | sed "s/<span style='width:[0-9]*px;' class='pairs-key'>tartózkodási hely:<\/span><span style='margin-left:[0-9]*px;' class='pairs-value'>//g" | sed "s/<\/span>//g" | sed "s/&gt;/>/g" | cut -d">" -f 3 | cut -d" " -f 2- | sed "s/;/,/g")
    MYVIP_URL="http://myvip.com/profile.php?uid=$n"
    # Decode HTML entities left in the fields (their trailing ';' was turned
    # into ',' by the s/;/,/g above, hence the "&quot," style patterns), then
    # write the semicolon-separated index line.
    # NOTE(review): these patterns too were reconstructed from the decoded wiki
    # dump -- confirm against the original script before trusting the index.
    echo "$MYVIP_NAME;$MYVIP_NICKNAME;$MYVIP_BIRTHDATE;$MYVIP_PERM_ADDRESS;$MYVIP_TEMP_ADDRESS;$MYVIP_URL" | sed "s/&quot,/\"/g" | sed "s/&amp,/\&/g" | sed "s/&lt,/</g" | sed "s/&gt,/>/g" > "$WARC_NAME.csv"
    echo "> Profile for user '$MYVIP_NAME' indexed."

    grep "loaded-image-userprofile_avatar" temp1 | grep -o "http[0-9a-zA-Z/\.?:_]*" | uniq | sed "s/\\\//g" >> list # avatar pic (strip JS-escaped slashes)

    # Club memberships: if the club list is paged, walk every page; club
    # avatar URLs are collected into avatars/clubav_<id> for later download.
    if grep -q "onclick='profile_gotopage(\"\",[0-9],[0-9]*); return false' class='rangepager-jump rangepager-jump-last'>" temp1; then
      NUMCLUBS=$(grep -o "onclick='profile_gotopage(\"\",[0-9],[0-9]*); return false' class='rangepager-jump rangepager-jump-last'>" temp1 | cut -d"," -f 2) # counting clublist pages
      if [[ ! $NUMCLUBS = "" ]]; then
        echo "> Parsing for club avatars..."
        for (( i = 0; i <= NUMCLUBS; i++ ))
        do
          echo -n "$((NUMCLUBS-i))..." # print progress
          echo "http://myvip.com/profile.php?act=getclubs&page=$i&uid=$n" >> list # adding them to list
          fetch "http://myvip.com/profile.php?act=getclubs&page=$i&uid=$n" "temp2" # fetching to discover clubavatars
          grep -o "img src=\"http://avatar\.myvip\.com/avatars/clubs[^\"]*\"" temp2 | cut -d'"' -f 2 >> "avatars/clubav_$n"
        done
        echo
      else
        grep -o "img src=\"http://avatar\.myvip\.com/avatars/clubs[^\"]*\"" temp1 | cut -d'"' -f 2 >> "avatars/clubav_$n"
      fi
    else
      grep -o "img src=\"http://avatar\.myvip\.com/avatars/clubs[^\"]*\"" temp1 | cut -d'"' -f 2 >> "avatars/clubav_$n"
    fi

    # Links to pictures embedded on the profile page.
    grep -o "images.php?uid=[0-9]\+&imageid=[0-9]\+#imageview_container" temp1 | cut -d "'" -f 2 | sed "s/images\.php/http:\/\/myvip\.com\/images\.php/g" >> list

    # Acquaintances (contact list), possibly paged.
    if ! grep -q "dousercontacts" temp1; then # does the user have acquaintances?
      echo "> User has no acquaintances."
      MAXPAGE=-1
    else
      echo "> Discovering acquaintances..."
      fetch "http://myvip.com/search.php?act=dousercontacts&uid=$n" "temp1" # grabbing acq. list for discovering number of acq. pages
      if ! grep -q "rangepager-jump rangepager-jump-last rangepager-jump-disabled" temp1; then # the acq. list has more than one page
        MAXPAGE=$(grep "rangepager-jump rangepager-jump-last" temp1 | uniq | rev | cut -d"&" -f 1 | rev | cut -d"=" -f 2 | cut -d"'" -f 1) # number of acq. pages
        PAGER_PREFIX=$(grep "rangepager-jump rangepager-jump-last" temp1 | uniq | rev | cut -d"'" -f 4 | rev | cut -d "&" -f 1-2) # url prefix for acq. pages, including a unique pager id
        for (( i = 0; i <= MAXPAGE; i++ ))
        do
          echo -n "$((MAXPAGE-i))..." # print progress
          echo "http://myvip.com/$PAGER_PREFIX&p=$i" >> acq_list # urls for acquaintances pages. WE'LL MODIFY AND GRAB LATER!
          fetch "http://myvip.com/$PAGER_PREFIX&p=$i" "temp2" # discovering profile avatars
          grep -o "img src=\"http://avatar\.myvip\.com/avatars/users[^\"]*\"" temp2 | cut -d'"' -f 2 >> "avatars/profav_$n"
        done
        echo
      else
        grep -o "img src=\"http://avatar\.myvip\.com/avatars/users[^\"]*\"" temp1 | cut -d'"' -f 2 >> "avatars/profav_$n"
        MAXPAGE=0
        echo "http://myvip.com/browse.php?act=browse&pager=phant0mpag3r1d3nt1f13r&p=0" >> acq_list # placeholder pager id, replaced before grabbing
      fi
    fi

    # Images: discover every album and every image page.
    echo "> Discovering images..."
    echo "http://myvip.com/images.php?uid=$n" >> list
    fetch "http://myvip.com/images.php?uid=$n" "temp1" # fetching images page for discovery
    if grep -q "A felhasználónak nincs nyilvános albuma!" temp1; then # "the user has no public album"
      echo "> User has no public images."
    else
      grep "images.php?albumid" temp1 | cut -d'"' -f 2 | cut -d"/" -f 2 | uniq > temp2 # collecting direct album links' postfixes
      echo "> User has $(wc -l < temp2) public albums."
      sed "s/images\.php/http:\/\/myvip\.com\/images\.php/g" temp2 >> list # add myvip.com prefix
      grep -o "/images.php?uid=[0-9]\+&albumid=[0-9]\+&imageid=[0-9]\+&getcontent=album&isajax=1" temp1 | sed "s/\/images\.php/http:\/\/myvip\.com\/images\.php/g" >> list # collecting browser thumbnail album links
      grep -o "'/images.php?uid=[0-9]\+&albumid=[0-9]\+'" temp1 | cut -d "'" -f 2 | sed "s/\/images\.php/http:\/\/myvip\.com\/images\.php/g" | uniq >> list # get other kind of direct links to albums
      grep -o "/images.php?uid=[0-9]\+&albumid=[0-9]\+&getcontent=album&isajax=1" temp1 | sed "s/\/images\.php/http:\/\/myvip\.com\/images\.php/g" | uniq > temp3 # collecting browser album links
      cat temp3 >> list # we'll grab them too
      NUMALBUMS=$(wc -l < temp3)
      for (( h = 1; h <= NUMALBUMS; h++ ))
      do
        echo "> Discovering content of album $h/$NUMALBUMS..."
        fetch "$(head -n "$h" temp3 | tail -n 1)" "temp4" # fetch album's embedded page
        grep -o "/images.php?uid=[0-9]\+&imageid=[0-9]\+&getcontent=img&isajax=1" temp4 | sed "s/\/images\.php/http:\/\/myvip\.com\/images\.php/g" > temp5 # collect image page postfixes & add myvip.com prefix
        echo "> User has $(wc -l < temp5) images in this album."
        cat temp5 >> list # add them to list
        grep -o "<div class=\"thumbnail-commentcnt\">[^<]*</div>" temp4 | cut -d">" -f 2 | cut -d"<" -f 1 > temp41 # list of number of comments
        grep -o "/images.php?uid=[0-9]\+&imageid=[0-9]\+&getcontent=img&isajax=1" temp4 > temp42 # list of image pages, in the same order
        ALBUMID=$(head -n "$h" temp3 | tail -n 1 | grep -o "albumid=[0-9]\+" | cut -d"=" -f 2)
        NUMIMAGES=$(wc -l < temp41)
        for (( i = 1; i <= NUMIMAGES; i++ ))
        do
          # Presumably only ~20 comments are inlined on the image page; fetch
          # the full comment list separately when there are more -- verify.
          if [ "$(head -n "$i" temp41 | tail -n 1)" -gt 20 ]; then
            echo "http://myvip.com/images.php?imageid=$(head -n "$i" temp42 | tail -n 1 | cut -d'&' -f 2 | cut -d'=' -f 2)&albumid=$ALBUMID&uid=$n&isajax=1&getcontent=comments" >> list # get comments
          fi
        done
      done
    fi

    echo "> Downloading discovered content..."
    wpull $WPULL_OPTS --warc-file "$WARC_NAME" -i list # actual content grab
    check_wpull_log "$n" "$WARC_NAME"

    if [ "$MAXPAGE" -ne -1 ]; then
      echo "> Downloading acquaintances pages"
      # The pager id recorded in acq_list is stale by now; grab one contacts
      # page to learn the current pager id, then rewrite the list with it.
      OLDPAGERID=$(head -n 1 acq_list | grep -o "pager=[0-9a-z]*" | cut -d"=" -f 2)
      echo "http://myvip.com/search.php?act=dousercontacts&uid=$n" > list # one URL to find out current pager ID
      wpull $WPULL_OPTS --warc-file "$WARC_NAME" -i list
      check_wpull_log "$n" "$WARC_NAME"
      if grep -q "Fetching ‘http://myvip.com/browse.php?pager=[0-9a-z]*&p=0’ encountered an error" log.txt; then
        echo "> A rare problem occured. Grab of this user profile must be restarted."
        rm acq_list temp1 temp2 temp3 temp4 temp41 temp42 temp5 list db "$WARC_NAME.warc.gz" "$WARC_NAME.csv" log.txt "avatars/profav_$n" "avatars/clubav_$n" 2>/dev/null
        ((n--)) # redo this user id on the next iteration
        [ -e STOP ] && ((n=MYVIP_B)) # unless a STOP file asks us to finish
        continue
      fi
      NEWPAGERID=$(grep "pager" log.txt | tail -n 1 | cut -d"=" -f 2 | cut -d"&" -f 1)
      sed -i -e "s/$OLDPAGERID/$NEWPAGERID/g" acq_list
      mv acq_list list
      wpull $WPULL_OPTS --warc-file "$WARC_NAME" -i list # needed so that wpull surely uses the old database
      check_wpull_log "$n" "$WARC_NAME"
    fi
    echo "> myVIP profile of user '$MYVIP_NAME' (id $n) has been successfully archived!"
  fi

  rm temp1 temp2 temp3 temp4 temp41 temp42 temp5 list db 2>/dev/null
  mv "$WARC_NAME.warc.gz" warcs
  mv "$WARC_NAME.csv" index
  mv log.txt "logs/log_$n.txt"
  [ -e STOP ] && ((n=MYVIP_B)) # if STOP file is present, we stop the loop
done

unset MYVIP_NAME MYVIP_NICKNAME MYVIP_BIRTHDATE MYVIP_PERM_ADDRESS MYVIP_TEMP_ADDRESS MYVIP_URL MAXPAGE PAGER_PREFIX NUMALBUMS ALBUMID NUMIMAGES MYVIP_TEMP NEWPAGERID OLDPAGERID NUMCLUBS
unset MYVIP_A MYVIP_B WPULL_OPTS WGET_OPTS
rm myvip_script_lock
exit 0