MyVIP/Bash script

From Archiveteam
< MyVIP
Revision as of 11:49, 3 October 2015 by Bzc6p (talk | contribs) (Created page with "MyVIP archiving bash script, written by user:bzc6p. Needs to be rewritten to conform ArchiveTeam framework and standards. <pre> #!/bin/bash # Discovers and downloads user...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to: navigation, search

MyVIP archiving bash script, written by user:bzc6p. Needs to be rewritten to conform to the ArchiveTeam framework and standards.

#!/bin/bash
# Discovers and downloads user content belonging to given user ID
# Accepts one or two parameters: a single id or two ids, in the latter case does the range.
# Creates a WARC file with the profile content and a csv file with one line containing some identifying information about the user.
# Avatar pictures' links are collected for future downloading.

abort_wpull ()	# if wpull is redirected, content is wrong and therefore we shouldn't go on
{
  # Abort handler for a poisoned grab: wpull's log showed a redirect to
  # index.php/homeent.php, which means authentication was lost and the
  # pages it saved are wrong.
  #   $1 = user id currently being archived
  #   $2 = WARC base name (without .warc.gz)
  # Moves the suspect WARC/csv/log into ERROR/ for inspection, removes
  # per-run scratch files (including the lock) and clears loop state so
  # the next run starts clean.
  local uid="$1" warc="$2"
  echo "> Wpull grabbed wrong pages last time, you probably have lost authentication or something other weird happened. Check the logs before going on."
  echo "> Aborted."
  rm temp1 temp2 temp3 temp4 temp41 temp42 temp5 acq_list list db myvip_script_lock 2>/dev/null
  mkdir ERROR 2>/dev/null
  mv "$warc.warc.gz" ERROR
  mv "$warc.csv" ERROR
  mv log.txt "ERROR/log_$uid.txt"
  rm avatars/*av_"$uid"   # glob kept unquoted on purpose: matches profav_/clubav_
  unset MYVIP_NAME MYVIP_NICKNAME MYVIP_BIRTHDATE MYVIP_PERM_ADDRESS MYVIP_TEMP_ADDRESS MYVIP_URL MAXPAGE PAGE_PREFIX NUMALBUMS ALBUMID NUMIMAGES MYVIP_TEMP NEWPAGERID OLDPAGERID NUMCLUBS
  unset MYVIP_A MYVIP_B WPULL_OPTS WGET_OPTS
}

fetch ()     # In case wget has a DNS error (doesn't retry) or we've lost authentication.
{
    # Fetch a single URL into a file, retrying forever.
    #   $1 = URL to fetch
    #   $2 = output file name
    # Retries once a minute on any wget failure, and also once a minute
    # when the fetched page contains the login button ("Bejelentkezés"
    # = "Log in"), i.e. the exported session cookie has expired.
    local url="$1" out="$2"
    while true
    do
        # $WGET_OPTS is intentionally unquoted: it holds multiple options.
        wget $WGET_OPTS -O "$out" "$url" || { echo "> Probably an error in the connection. Sleeping 1 minute..."; sleep 60; continue; }
        if grep -q "<span class=\"btn-text\">Bejelentkezés</span>" "$out"; then
            echo "> You have lost your authentication! Log in and export your cookies file again!"
            echo "> Sleeping 1 minute..."
            sleep 60
            echo "> Retrying..."
            continue
        fi
        break
    done
}

echo "*** myVIP user backup script ***"
# $1 = first (or only) user id, $2 = optional last user id of a range.
# Both must be numeric; inside [[ ]] with -le the operands are evaluated
# arithmetically, so "$1-$2 -le 0" enforces $1 <= $2.
[[ $1 =~ ^[0-9]+$ ]] || { echo "> First parameter wrong!"; echo "> Aborted."; exit 1; }
[[ -z $2 ]] || [[ $2 =~ ^[0-9]+$ ]] || { echo "> Second parameter wrong!"; echo "> Aborted."; exit 1; }
[[ -z $2 ]] || [[ $1-$2 -le 0 ]] || { echo "> Parameters wrong!"; echo "> Aborted."; exit 1; }
echo "> Looking for wpull..."
# The grab itself is done with wpull; version 1.2+ is required for the
# options used below (e.g. --warc-append).
wpull --version > wpull_ver 2>/dev/null || { echo "> You don't have wpull installed! wpull is necessary for the script to run!"; echo "> Aborted"; rm wpull_ver; exit 1; }
[ `cut -d"." -f 1 wpull_ver` -lt 1 ] && { echo "> Your wpull version is too old (`cat wpull_ver`). The script needs at least wpull version 1.2 to run."; echo "> Aborted."; rm wpull_ver; exit 1; }
[ `cut -d"." -f 1 wpull_ver` -eq 1 ] && [ `cut -d"." -f 2 wpull_ver` -lt 2 ] && { echo "> Your wpull version is too old (`cat wpull_ver`). The script needs at least wpull version 1.2 to run."; echo "> Aborted."; rm wpull_ver; exit 1; }
rm wpull_ver
echo "> Checking authentication..."
# A logged-in session shows the "Adatlap" (profile) menu item; if it is
# missing, cookies.txt is stale or the connection is down.
if [ `wget --load-cookies cookies.txt -q -O - http://myvip.com/profile.php | grep -c "Adatlap"` -lt 1 ]; then
    echo "> Authentication failed. Check your cookies file or your internet connection."; echo "> Aborted."; exit 1
fi
# Crude lock file: refuse to run while another instance appears active.
[ -e myvip_script_lock ] && { echo "> Another myVIP backup script seems to be running! Multiple instances of the script MUST NOT be run at the same time!"; echo "> It is possible though that the last run interrupted. If you are sure no other myVIP backup script is running, issue 'rm myvip_script_lock' and retry."; echo "> Aborted."; exit 1; }
touch myvip_script_lock
mkdir avatars warcs logs index 2>/dev/null
MYVIP_A=$1
if [[ -z $2 ]]; then
    MYVIP_B=$1
    echo "> Backing up myVIP user profile $MYVIP_A"
else
    MYVIP_B=$2
    echo "> Backing up myVIP user profiles ${MYVIP_A}–${MYVIP_B}"
fi
WPULL_OPTS="--exclude-domains static.myvip.com,avatar.myvip.com --reject-regex infobar_frame|banner_bottombanner_frame -a log.txt --retry-connrefused --retry-dns-error --tries inf --waitretry 10 --timeout 30 --no-robots --progress none --load-cookies cookies.txt -p -H -Dmyvip.com --no-warc-keep-log --delete-after --database db --warc-append"      # options for wpull
WGET_OPTS="-q -a log.txt --retry-connrefused -e robots=off --tries 0 --waitretry 10 --timeout 30 --load-cookies cookies.txt"
# Main loop: one iteration per user id n from MYVIP_A through MYVIP_B.
# For each user: fetch the profile page, write one index line to a csv,
# discover club/acquaintance/image URLs into the file 'list', then let
# wpull grab them all into a single WARC. temp1..temp5 are scratch files.
for (( n = $MYVIP_A; n <= $MYVIP_B; n++ ))
do
    WARC_NAME=myvip_com_user_$n
    rm list acq_list 2>/dev/null
    echo "-------------------------------------------------------------------------------"
    unset MYVIP_NAME MYVIP_NICKNAME MYVIP_BIRTHDATE MYVIP_PERM_ADDRESS MYVIP_TEMP_ADDRESS MYVIP_URL MAXPAGE PAGE_PREFIX NUMALBUMS ALBUMID NUMIMAGES MYVIP_TEMP NEWPAGERID OLDPAGERID NUMCLUBS
    echo "> Fetching user page $n..."
    fetch `echo "http://myvip.com/profile.php?uid=$n"` "temp1"  # initial grab of user page  
    # "Törölt, vagy nem létező felhasználó!" = "Deleted, or non-existent user!"
    if [ `grep "Törölt, vagy nem létező felhasználó!" temp1 | wc -l | cut -d" " -f 1` -ne 0 ]; then       # if profile doesn't exist
	echo "> User profile doesn't exist, saving empty page..."
	echo ";;;;;http://myvip.com/profile.php?uid=$n" > $WARC_NAME.csv
	wpull $WPULL_OPTS --warc-file $WARC_NAME "http://myvip.com/profile.php?uid=$n"     # actual content grab
	# index.php/homeent.php in the log means wpull got redirected to the
	# front page, i.e. authentication was lost: abort the whole run.
	if [ `grep "index\.php" log.txt | wc -l | cut -d" " -f 1` -ne 0 -o `grep "homeent\.php" log.txt | wc -l | cut -d" " -f 1` -ne 0 ]; then
	  abort_wpull $n $WARC_NAME
	  exit 1
	fi
	echo "> Empty profile page $n archived."
    else        # if user page exists
        echo "http://myvip.com/profile.php?uid=$n" >> list       # it will be grabbed
        # In the following lines, we parse the profile page for some identification information. Those of everyone will be put in an index so that if one looks for their profile, they can easily find them. Multiple fields are necessary because several people may have the same name, and not everyone fill in all the fields. The index can be hidden or truncated later; the script should build it anyway.
        # We'll use semicolon as field separator, so we replace the possible semicolons with commas
        # Hungarian field labels: név=name, becenév=nickname,
        # születési idő=date of birth, lakhely=residence (permanent),
        # tartózkodási hely=current address.
        MYVIP_NAME=`grep -o "<span style='width:[0-9]*px;' class='pairs-key'>név:</span><span style='margin-left:[0-9]*px;' class='pairs-value'>[^<]*</span>" temp1 | sed "s/<span style='width:[0-9]*px;' class='pairs-key'>név:<\/span><span style='margin-left:[0-9]*px;' class='pairs-value'>//g" | sed "s/<\/span>//g" | sed "s/;/,/g"`
        MYVIP_NICKNAME=`grep -o "<span style='width:[0-9]*px;' class='pairs-key'>becenév:</span><span style='margin-left:[0-9]*px;' class='pairs-value'>[^<]*</span>" temp1 | sed "s/<span style='width:[0-9]*px;' class='pairs-key'>becenév:<\/span><span style='margin-left:[0-9]*px;' class='pairs-value'>//g" | sed "s/<\/span>//g" | sed "s/;/,/g"`
        MYVIP_BIRTHDATE=`grep -o "<span style='width:[0-9]*px;' class='pairs-key'>születési idő:</span><span style='margin-left:[0-9]*px;' class='pairs-value'>[^<]*</span>" temp1 | sed "s/<span style='width:[0-9]*px;' class='pairs-key'>születési idő:<\/span><span style='margin-left:[0-9]*px;' class='pairs-value'>//g" | sed "s/<\/span>//g" | cut -d" " -f1-3`
        MYVIP_PERM_ADDRESS=`grep -o "<span style='width:[0-9]*px;' class='pairs-key'>lakhely:</span><span style='margin-left:[0-9]*px;' class='pairs-value'>[^<]*</span>" temp1 | sed "s/<span style='width:[0-9]*px;' class='pairs-key'>lakhely:<\/span><span style='margin-left:[0-9]*px;' class='pairs-value'>//g" | sed "s/<\/span>//g" | sed "s/>/>/g" | cut -d">" -f 3 | cut -d" " -f 2- | sed "s/;/,/g"`
        MYVIP_TEMP_ADDRESS=`grep -o "<span style='width:[0-9]*px;' class='pairs-key'>tartózkodási hely:</span><span style='margin-left:[0-9]*px;' class='pairs-value'>[^<]*</span>" temp1 | sed "s/<span style='width:[0-9]*px;' class='pairs-key'>tartózkodási hely:<\/span><span style='margin-left:[0-9]*px;' class='pairs-value'>//g" | sed "s/<\/span>//g" | sed "s/>/>/g" | cut -d">" -f 3 | cut -d" " -f 2- | sed "s/;/,/g"`
        MYVIP_URL="http://myvip.com/profile.php?uid=$n"
        # NOTE: the ;→, replacement above also mangled HTML entities
        # (&quot; became &quot,), which is why the decode patterns below
        # match a comma instead of a semicolon.
        echo "$MYVIP_NAME;$MYVIP_NICKNAME;$MYVIP_BIRTHDATE;$MYVIP_PERM_ADDRESS;$MYVIP_TEMP_ADDRESS;$MYVIP_URL" | sed "s/&quot,/\"/g" | sed "s/&amp,/&/g" | sed "s/&lt,/</g" | sed "s/&gt,/>/g" > $WARC_NAME.csv       # decoding special characters; they go to a semicolon-seperated file
        echo "> Profile for user '$MYVIP_NAME' indexed."
        grep "loaded-image-userprofile_avatar" temp1 | grep -o "http[0-9a-zA-Z/\.?:_]*" | uniq | sed "s/\\\//g" >> list     # avatar pic
        # The club list may be paginated; the "rangepager-jump-last" link
        # carries the number of the last page.
        if [ `grep -o "onclick='profile_gotopage(\"\",[0-9],[0-9]*); return false' class='rangepager-jump rangepager-jump-last'>" temp1 | wc -l | cut -d" " -f 1` -gt 0 ]; then
            NUMCLUBS=`grep -o "onclick='profile_gotopage(\"\",[0-9],[0-9]*); return false' class='rangepager-jump rangepager-jump-last'>" temp1 | cut -d"," -f 2`       # counting clublist pages
            if [[ ! $NUMCLUBS = "" ]]; then
		echo "> Parsing for club avatars..."
                for (( i = 0; i <= $NUMCLUBS; i++))
                do
		    echo -n $(($NUMCLUBS-$i))...	# print progress
                    echo "http://myvip.com/profile.php?act=getclubs&page=$i&uid=$n" >> list		# adding them to list
                    fetch `echo "http://myvip.com/profile.php?act=getclubs&page=$i&uid=$n"` "temp2"	# fetching to discover clubavatars
                    grep -o "img src=\"http://avatar\.myvip\.com/avatars/clubs[^\"]*\"" temp2 | cut -d'"' -f 2 >> avatars/clubav_$n
                done
                echo
            else
		grep -o "img src=\"http://avatar\.myvip\.com/avatars/clubs[^\"]*\"" temp1 | cut -d'"' -f 2 >> avatars/clubav_$n
            fi
        else
	    grep -o "img src=\"http://avatar\.myvip\.com/avatars/clubs[^\"]*\"" temp1 | cut -d'"' -f 2 >> avatars/clubav_$n
        fi
        grep -o "images.php?uid=[0-9]\+&imageid=[0-9]\+#imageview_container" temp1 | cut -d "'" -f 2 | sed "s/images\.php/http:\/\/myvip\.com\/images\.php/g" >> list   # links to pictures on profile page
        if [ `grep "dousercontacts" temp1 | wc -l | cut -d" " -f 1` -eq 0 ]; then     # does the user have acquaintances?
            echo "> User has no acquaintances."
            MAXPAGE=-1
        else
            echo "> Discovering acquaintances..."
            fetch `echo "http://myvip.com/search.php?act=dousercontacts&uid=$n"` "temp1"      # grabbing acq. list for discovering number of acq. pages  
            if [ `grep "rangepager-jump rangepager-jump-last rangepager-jump-disabled" temp1 | wc -l | cut -d" " -f 1` -eq 0 ]; then      # does the acq. list have more than one page?
                MAXPAGE=`grep "rangepager-jump rangepager-jump-last" temp1 | uniq | rev | cut -d"&" -f 1 | rev | cut -d"=" -f 2 | cut -d"'" -f 1`        # number of acq. pages
                PAGER_PREFIX=`grep "rangepager-jump rangepager-jump-last" temp1 | uniq | rev | cut -d"'" -f 4 | rev | cut -d "&" -f 1-2`     # url prefix for acq. pages, including a unique pager id
                for (( i = 0; i <= $MAXPAGE; i++ ))
                do
		    echo -n $(($MAXPAGE-$i))...	# print progress
                    echo "http://myvip.com/$PAGER_PREFIX&p=$i" >> acq_list       # urls for acquaintances pages. WE'LL MODIFY AND GRAB LATER!
                    fetch `echo "http://myvip.com/$PAGER_PREFIX&p=$i"` "temp2" 	# discovering profile avatars
                    grep -o "img src=\"http://avatar\.myvip\.com/avatars/users[^\"]*\"" temp2 | cut -d'"' -f 2 >> avatars/profav_$n
                done
                echo
            else
		grep -o "img src=\"http://avatar\.myvip\.com/avatars/users[^\"]*\"" temp1 | cut -d'"' -f 2 >> avatars/profav_$n
                MAXPAGE=0
                # Placeholder pager id — swapped for the live session's id
                # later (see OLDPAGERID/NEWPAGERID below) before the grab.
                echo "http://myvip.com/browse.php?act=browse&pager=phant0mpag3r1d3nt1f13r&p=0" >> acq_list
            fi
            #echo "> Found $(( $MAXPAGE + 1 )) pages of acquaintances."	# We've already printed progress, deprecated
        fi
        echo "> Discovering images..."
        echo "http://myvip.com/images.php?uid=$n" >> list
        fetch `echo "http://myvip.com/images.php?uid=$n"` "temp1"    # fetching images page for discovery
        # "A felhasználónak nincs nyilvános albuma!" = "The user has no public album!"
        if [ `grep "A felhasználónak nincs nyilvános albuma!" temp1 | wc -l | cut -d" " -f 1` -ne 0 ]; then       # does the user have images?
            echo "> User has no public images."
        else
            grep "images.php?albumid" temp1 | cut -d'"' -f 2 | cut -d"/" -f 2 | uniq > temp2      # collecting direct album links' postfixes
            # cut -d"=" -f 2 temp2 | cut -d"&" -f 1 > albumids_$n      # collecting albumids (probably not necessary)
            echo "> User has `wc -l temp2 | cut -d" " -f 1` public albums."
            sed "s/images\.php/http:\/\/myvip\.com\/images\.php/g" temp2 >> list      # add myvip.com prefix
            grep -o "/images.php?uid=[0-9]\+&albumid=[0-9]\+&imageid=[0-9]\+&getcontent=album&isajax=1" temp1 | sed "s/\/images\.php/http:\/\/myvip\.com\/images\.php/g" >> list      # collecting browser thumbnail album links
            grep -o "'/images.php?uid=[0-9]\+&albumid=[0-9]\+'" temp1 | cut -d "'" -f 2 | sed "s/\/images\.php/http:\/\/myvip\.com\/images\.php/g" | uniq >> list     # get other kind of direct links to albums
            grep -o "/images.php?uid=[0-9]\+&albumid=[0-9]\+&getcontent=album&isajax=1" temp1 | sed "s/\/images\.php/http:\/\/myvip\.com\/images\.php/g" | uniq > temp3      # collecting browser album links
            cat temp3 >> list     # we'll grab them too
            NUMALBUMS=`wc -l temp3 | cut -d" " -f 1`
            for (( h = 1; h <= $NUMALBUMS; h++))
            do
                echo "> Discovering content of album $h/$NUMALBUMS..."
                fetch `head -$h temp3 | tail -1` "temp4"      # fetch albums' embedded pages 
                grep -o "/images.php?uid=[0-9]\+&imageid=[0-9]\+&getcontent=img&isajax=1" temp4 | sed "s/\/images\.php/http:\/\/myvip\.com\/images\.php/g" > temp5      # collect image page postfixes & add myvip.com prefix
                echo "> User has `wc -l temp5 | cut -d" " -f 1` images in this album."
                cat temp5 >> list      # add them to list
                grep -o "<div class=\"thumbnail-commentcnt\">[^<]*</div>" temp4 | cut -d">" -f 2 | cut -d"<" -f 1 > temp41   # list of number of comments
                grep -o "/images.php?uid=[0-9]\+&imageid=[0-9]\+&getcontent=img&isajax=1" temp4 > temp42    # list of image pages, in the same order
                ALBUMID=`head -$h temp3 | tail -1 | grep -o "albumid=[0-9]\+" | cut -d"=" -f 2`
                NUMIMAGES=`wc -l temp41 | cut -d" " -f 1`
                for (( i = 1; i <= $NUMIMAGES; i++))
                do
                    # Extra comments request only when count > 20 — presumably
                    # only the first 20 comments are inline. TODO confirm.
                    if [ `head -$i temp41 | tail -1` -gt 20 ]; then
                        echo "http://myvip.com/images.php?imageid=`head -$i temp42 | tail -1 | cut -d'&' -f 2 | cut -d'=' -f 2`&albumid=$ALBUMID&uid=$n&isajax=1&getcontent=comments" >> list       # get comments
                    fi
                done
            done
        fi
        echo "> Downloading discovered content..."
	wpull $WPULL_OPTS --warc-file $WARC_NAME -i list     # actual content grab
	if [ `grep "index\.php" log.txt | wc -l | cut -d" " -f 1` -ne 0 -o `grep "homeent\.php" log.txt | wc -l | cut -d" " -f 1` -ne 0 ]; then
	  abort_wpull $n $WARC_NAME
	  exit 1
	fi
	# Acquaintance pages use a session-bound pager id. Grab one page to
	# learn the current id from the log, then rewrite acq_list with it.
	if [ $MAXPAGE -ne -1 ]; then
	  echo "> Downloading acquaintances pages"
	  OLDPAGERID=`head -1 acq_list | grep -o "pager=[0-9a-z]*" | cut -d"=" -f 2`
	  echo "http://myvip.com/search.php?act=dousercontacts&uid=$n" > list	# one URL to find out current pager ID
	  wpull $WPULL_OPTS --warc-file $WARC_NAME -i list
	  if [ `grep "index\.php" log.txt | wc -l | cut -d" " -f 1` -ne 0 -o `grep "homeent\.php" log.txt | wc -l | cut -d" " -f 1` -ne 0 ]; then
	    abort_wpull $n $WARC_NAME
	    exit 1
	  fi
	  if [ `grep "Fetching ‘http://myvip.com/browse.php?pager=[0-9a-z]*&p=0’ encountered an error" log.txt | wc -l | cut -d" " -f 1` -ne 0 ]; then
	    echo "> A rare problem occured. Grab of this user profile must be restarted."
	    rm acq_list temp1 temp2 temp3 temp4 temp41 temp42 temp5 list db $WARC_NAME.warc.gz $WARC_NAME.csv log.txt avatars/profav_$n avatars/clubav_$n 2>/dev/null
            ((n--))
	    cat STOP 2>/dev/null && ((n=$MYVIP_B))
	    continue
	  fi
	  NEWPAGERID=`grep "pager" log.txt | tail -1 | cut -d"=" -f 2 | cut -d"&" -f 1`
	  sed -i -e "s/$OLDPAGERID/$NEWPAGERID/g" acq_list
	  mv acq_list list
	  wpull $WPULL_OPTS --warc-file $WARC_NAME -i list	# needed so that wpull surely uses the old database
	  if [ `grep "index\.php" log.txt | wc -l | cut -d" " -f 1` -ne 0 -o `grep "homeent\.php" log.txt | wc -l | cut -d" " -f 1` -ne 0 ]; then
	    abort_wpull $n $WARC_NAME
	    exit 1
	  fi
	fi
	echo "> myVIP profile of user '$MYVIP_NAME' (id $n) has been successfully archived!"
    fi
    rm temp1 temp2 temp3 temp4 temp41 temp42 temp5 list db 2>/dev/null
    mv $WARC_NAME.warc.gz warcs
    mv $WARC_NAME.csv index
    mv log.txt logs/log_$n.txt
    cat STOP 2>/dev/null && ((n=$MYVIP_B))	# if STOP file is present, we stop the loop
done
# Final cleanup: drop working variables and release the lock file.
unset MYVIP_NAME MYVIP_NICKNAME MYVIP_BIRTHDATE MYVIP_PERM_ADDRESS MYVIP_TEMP_ADDRESS MYVIP_URL MAXPAGE PAGE_PREFIX NUMALBUMS ALBUMID NUMIMAGES MYVIP_TEMP NEWPAGERID OLDPAGERID NUMCLUBS
unset MYVIP_A MYVIP_B WPULL_OPTS WGET_OPTS
rm myvip_script_lock
exit 0