#
# This application is implemented in the Perl programming language.
#
# The application reads files from the file system and imports them to the
# SIETS storage through the HTTP POST interface using LWP::UserAgent.
#
# The application receives file names as command line arguments.
#
# It also detects whether a file is a text file or a binary file by counting
# the whitespace in it: if a file contains relatively little whitespace, it is
# considered to be a binary file, and if it contains relatively more
# whitespace, it is considered to be a text file.
#
use strict;
use warnings;

use Fcntl qw(:mode);
use File::stat;
use HTTP::Request::Common;
use LWP::UserAgent;

# Connection parameters. These are the defaults; each one can be overridden
# with the matching command line option.
my %config = (
    url      => "http://127.0.0.1/cgi-bin/siets/api.cgi",
    storage  => "test",
    user     => "guest",
    password => "guest",
    encoding => "US-ASCII",
);

# Maps an option letter to the configuration entry it overrides.
my %config_key_for = (
    r => 'url',
    s => 'storage',
    u => 'user',
    p => 'password',
    e => 'encoding',
);

# Minimum share of whitespace bytes a file must have to be imported as text.
my $REQUIRED_WHITESPACE_FRACTION = 0.12;

if (@ARGV == 0) {
    print "Usage: [-r url] [-s storage] [-u user] [-p password] [-e encoding] files\n";
    exit;
}

# Split the command line into options and file names in a single pass.
# Options may appear anywhere and apply to every file, because no file is
# processed until the whole command line has been read.
my @files;
for (my $i = 0; $i < @ARGV; $i++) {
    my $arg = $ARGV[$i];

    if (substr($arg, 0, 1) ne '-') {
        push @files, $arg;
        next;
    }

    # An option as the very last argument has no value; nothing follows it,
    # so scanning stops here.
    if ($i + 1 >= @ARGV) {
        print STDERR "Option '$arg' requires a value\n";
        last;
    }

    # Only the first character after the dash selects the option.
    my $opt = substr($arg, 1, 1);
    my $key = $config_key_for{$opt};
    if (defined $key) {
        $config{$key} = $ARGV[$i + 1];
    }
    else {
        print "Unknown option: ", $arg, "\n";
    }
    $i++;    # skip the option value (also skipped for unknown options)
}

my $ua = LWP::UserAgent->new;

# Names of files to be imported are passed as arguments:
# process each of them.
for my $fn (@files) {
    print "Reading file: '", $fn, "'\n";

    my $buf_ref = slurp_regular_file($fn);
    next unless defined $buf_ref;

    if (is_text($buf_ref)) {
        insert_document($ua, \%config, $fn, $buf_ref);
    }
    else {
        print "\tBinary file: ignored\n";
    }
}

# Reads the whole regular file $fn into memory.
#
# Returns a reference to the file contents, or nothing when the file cannot
# be opened, inspected or read completely, or when it is not a regular file;
# in each of those cases a diagnostic is printed to STDERR.
#
# Note: this sample program assumes that the whole file fits into memory,
# so if you need to work with larger files, figure out something else.
sub slurp_regular_file {
    my ($fn) = @_;

    # Three-argument open: the file name can never be taken as a mode or pipe.
    my $fh;
    if (!open($fh, '<', $fn)) {
        print STDERR "Could not open file '$fn'\n";
        return;
    }
    binmode($fh);    # byte count must match the size reported by stat

    my $st = stat($fh);
    if (!$st) {
        print STDERR "Filesystem error retrieving info on '$fn'\n";
        close($fh);
        return;
    }

    if (!S_ISREG($st->mode)) {
        print STDERR "File '$fn' is not a regular file\n";
        close($fh);
        return;
    }

    my $size = $st->size;
    print "\tSize: ", $size, " bytes\n";

    my $buf = '';
    my $got = read($fh, $buf, $size);
    close($fh);

    if (!defined $got || $got != $size) {
        print STDERR "Error reading file\n";
        return;
    }

    return \$buf;
}

# Estimates whether the referenced buffer holds natural language text.
#
# Natural language text, in contrast to binary data, must contain a
# significant portion of whitespace, so the buffer counts as text when at
# least $REQUIRED_WHITESPACE_FRACTION of its bytes are whitespace.
# An empty buffer counts as text (0 >= 0).
sub is_text {
    my ($buf_ref) = @_;

    # tr/// with an empty replacement list counts without modifying the
    # buffer. The set is space, tab, newline, carriage return, form feed and
    # vertical tab.
    my $nspaces = ($$buf_ref =~ tr/ \t\n\r\f\x0B//);

    return $nspaces >= length($$buf_ref) * $REQUIRED_WHITESPACE_FRACTION;
}

# Executes the SIETS insert command through the HTTP POST interface and
# reports the outcome: the document id on STDOUT, any failure on STDERR.
#
#   $ua      - LWP::UserAgent used to send the request
#   $cfg     - hash reference with url, storage, user, password, encoding
#   $fn      - file name, used as both document id and title
#   $buf_ref - reference to the document text
sub insert_document {
    my ($ua, $cfg, $fn, $buf_ref) = @_;

    my $response = $ua->request(POST($cfg->{url}, [
        storage  => $cfg->{storage},
        command  => 'insert',
        user     => $cfg->{user},
        password => $cfg->{password},
        id       => $fn,
        title    => $fn,
        rate     => 100,
        text     => $$buf_ref,
        encoding => $cfg->{encoding},
    ]));

    if (!($response->is_success && $response->content)) {
        print STDERR "Error connecting to SIETS server: ", $response->code,
            ' - ', $response->message, "\n";
        return;
    }

    my $content = $response->content;

    # Simplified error check.
    # NOTE(review): the original error-detection pattern was empty (probably
    # markup lost in transit). The reply is treated as an error when it
    # carries the </code> element that the error report reads - confirm
    # against the SIETS reply format.
    if ($content =~ m{([^<]*)</code>}) {
        my $code = $1;
        my $text = $content =~ m{([^<]*)</text>} ? $1 : '';
        print STDERR "Error returned from SIETS server: $code - $text\n";
    }
    elsif ($content =~ m{([^<]*)</docid>}) {
        print "Document inserted: docid = $1\n";
    }
    else {
        print STDERR "Unexpected reply from SIETS server\n";
    }

    return;
}