Mercurial > hg > lukkari
view update.sh @ 75:3d9e42477367
More improvements in the parsing and XML output.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Wed, 24 Oct 2012 07:33:28 +0300 |
parents | 2cfb0a7eac9b |
children | d1b65d9903ab |
line wrap: on
line source
#!/bin/sh
#
# Fetch an index page and the per-class pages it links to with wget, keep the
# pages in a cache directory, and run parsedata.pl on every page that is new
# or differs from the previously cached copy.
#
# Needs wget, perl and diff on PATH; parsedata.pl is looked up relative to
# the current working directory.

# Suffix appended to a class name to form the URL of its page.
URLSUFFIX=".htm"

#######################################
# Refresh the cache for one index URL and (re)generate parsed output.
#
# Globals:
#   CACHEDIR   (read) cache directory; used as a plain string prefix, so it
#              must end with "/"
#   URLSUFFIX  (read) page suffix, see above
#   URLPREFIX, CLASSFILE, LISTFILE, PATPREFIX, OLDCACHEDIR, INFILE, ONFILE,
#   DATAFILE, need_parse, class (written) -- plain sh has no local
#   variables, so these working variables are global.
# Arguments:
#   $1  URL of the index page; also the prefix of every class page URL
#   $2  file receiving the extracted class names, one per line
#       ("$2.tmp" holds the raw index page)
#   $3  prefix preceding the class name in index links and page URLs
# Outputs:
#   progress messages on stdout, warnings on stderr
#######################################
parse() {
  URLPREFIX="$1"
  CLASSFILE="$2"
  LISTFILE="$2.tmp"
  PATPREFIX="$3"

  # Create cache directories, if they do not exist (mkdir -p is a no-op
  # for directories that are already there).
  OLDCACHEDIR="${CACHEDIR}old/"
  mkdir -p "${CACHEDIR}" "${OLDCACHEDIR}"

  # Refresh the class list. When the fetch fails, an existing class list from
  # an earlier run is reused below.
  if wget -q -O "$LISTFILE" "$URLPREFIX"; then
    perl -ne "if (/<a href=\"${PATPREFIX}([A-Z]{3}\d\S+)${URLSUFFIX}\">/) { print \"\$1\n\"; }" < "$LISTFILE" > "$CLASSFILE"
    printf '* Fetched classfile %s: ' "$CLASSFILE"
    wc -l < "$CLASSFILE"
  else
    # wget -O creates/truncates its output file even when the download
    # fails; do not leave that residue behind.
    rm -f "$LISTFILE"
    printf 'warning: could not fetch %s\n' "$URLPREFIX" >&2
  fi

  if test ! -e "$CLASSFILE"; then
    return 0
  fi

  while IFS= read -r class; do
    need_parse=no
    INFILE="${CACHEDIR}${class}.html"
    ONFILE="${OLDCACHEDIR}${class}.html"
    DATAFILE="${class}.data"

    # Only trust the download when wget succeeded AND produced content.
    # Checking for mere existence is not enough: wget -O leaves an empty
    # file behind on failure, which would otherwise replace the good cached
    # page and trigger a parse of empty input.
    if wget -q -O "${INFILE}.new" "${URLPREFIX}${PATPREFIX}${class}${URLSUFFIX}" \
      && test -s "${INFILE}.new"; then
      # New data fetched, does old file exist?
      if test -e "$INFILE"; then
        # Yes, do a diff
        if ! diff -u "$INFILE" "$INFILE.new" > "$INFILE.diff"; then
          # There were differences: rotate the old page out and parse.
          need_parse=yes
          mv "$INFILE" "$ONFILE" \
            && mv "$INFILE.new" "$INFILE"
        else
          # No changes, apparently .. remove the new one
          rm -f "$INFILE.new" "$INFILE.diff"
        fi
      else
        # No old file, parse new data
        mv "$INFILE.new" "$INFILE"
        need_parse=yes
      fi
    else
      # Fetch failed or returned nothing: keep the cached page untouched.
      rm -f "${INFILE}.new"
      printf 'warning: could not fetch page for %s\n' "$class" >&2
    fi

    # Regardless of the fetch result: if the parsed data file is missing,
    # try to (re)build it from whatever cached page exists.
    if test ! -e "${CACHEDIR}${DATAFILE}"; then
      need_parse=yes
    fi

    # Parse only when requested and an input page is actually available.
    if test "$need_parse" = "yes" && test -e "$INFILE"; then
      # Keep the previous parse result around in the "old" directory.
      if test -e "${CACHEDIR}${DATAFILE}"; then
        mv "${CACHEDIR}${DATAFILE}" "${OLDCACHEDIR}${DATAFILE}"
      fi

      printf 'Parsing %s\n' "$class"
      perl parsedata.pl -php "$INFILE" -o "${CACHEDIR}${DATAFILE}"
      perl parsedata.pl -xml "$INFILE" -o "${CACHEDIR}${class}.xml"
    fi
  done < "$CLASSFILE"
}

CACHEDIR="cache/"
parse "http://www.oamk.fi/tyojarjestykset/otek/luokat/" "classes.txt" "OR_"
#parse "http://www.oamk.fi/~heikkim/riihi2/Oppilaat/" "classes.txt" "Ryh._"

CACHEDIR="cache-next/"
parse "http://www.oamk.fi/~heikkim/riihi2/Oppilaat/" "classes_next.txt" "Ryh._"