#!/usr/bin/perl -w # Copyright © 2001, 2003, 2004, 2005 Jamie Zawinski # # Permission to use, copy, modify, distribute, and sell this software and its # documentation for any purpose is hereby granted without fee, provided that # the above copyright notice appear in all copies and that both that # copyright notice and this permission notice appear in supporting # documentation. No representations are made about the suitability of this # software for any purpose. It is provided "as is" without express or # implied warranty. # # Created: 13-Jun-2001. # # Reads from a URL, saving it to a file. Saves N hours worth of data # (by the wall clock, not by bytes-read.) # # - if an error occurs on output, exit (assuming it means "disk full") # - if an error occurs on input, keep trying (assuming "network lossage") # # flock() is used to prevent two copies of archiver.pl from writing to the # same file. require 5; use diagnostics; use strict; use Socket; require POSIX; use Fcntl ':flock'; # import LOCK_* constants my $progname = $0; $progname =~ s@.*/@@g; my $version = q{ $Revision: 1.18 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/; my $verbose = 0; my $audio_suffix = "mp3"; my $file_bytes_per_second = 16000; # the magic number for 128k mp3 files... # When there is an input error, lay off for this long before trying again. # my $read_timeout = 60; # seconds my $start_time = undef; my $stop_time = undef; my $in_url = undef; my $out_dir = undef; my $out_filename = undef; my $in_open_p = 0; my $out_open_p = 0; my $read_buffer = ''; my $read_buffer_length = 0; my $total_bytes_written = 0; my $alarming = 0; local *IN; local *OUT; sub blurb { my ($sec, $min, $hour) = localtime; return "$progname: " . sprintf("%02d:%02d:%02d: ", $hour, $min, $sec); } sub error { my ($msg) = @_; print STDERR blurb() . $msg . "\n"; system "logger", "-t", $progname, $msg; exit 1; } # Opens the URL and swallows the header. The body is available on . # If an error happened, $in_open_p will be 0. # my $reopen_count = 0; sub open_url { my ($in_url) = @_; $in_open_p = 0; my($url_proto, $dummy, $serverstring, $path) = split(/\//, $in_url, 4); if (! ($url_proto && $url_proto =~ m/^http:$/i)) { error "not an HTTP URL: $in_url"; } $path = "" unless $path; my ($them,$port) = split(/:/, $serverstring); $port = 80 unless $port; print STDERR blurb() . "connecting to $serverstring...\n" if (($verbose > 1) || ($verbose == 1 && $reopen_count == 0)); $reopen_count++; my ($remote, $iaddr, $paddr, $proto, $line); $remote = $them; if ($port =~ /\D/) { $port = getservbyname($port, 'tcp') } $port || error "no such service: $port/tcp"; $iaddr = inet_aton($remote); if (!$iaddr) { print STDERR blurb() . "no such host: $remote\n"; return; } $paddr = sockaddr_in($port, $iaddr); $proto = getprotobyname('tcp'); if (!socket(IN, PF_INET, SOCK_STREAM, $proto)) { print STDERR blurb() . "socket: $!\n"; return; } if (!connect(IN, $paddr)) { print STDERR blurb() . "connecting to $serverstring: $!\n"; return; } select(IN); $| = 1; select(STDOUT); print IN ("GET /$path HTTP/1.0\r\n" . "Host: $them\r\n" . "User-Agent: $progname/$version\r\n" . "\r\n"); my $http = || 'EOF'; $http =~ s/[\r\n]+$//s; if (! ($http =~ m@^(HTTP/\d.\d+|ICY) 2\d\d\b@)) { print STDERR blurb() . "$http\n"; return; } # skip over http header data # while () { s/\r\n$/\n/; last if m@^\n@; } print STDERR blurb() . "connected ($http)\n" if ($verbose > 1); $in_open_p = 1; } sub open_in { my $countdown = 4000; close_in() if $in_open_p; while (!$in_open_p) { open_url ($in_url); if (! $in_open_p) { if (--$countdown <= 0) { error ("still unable to open stream. bailing!"); } else { sleep 5; } } } } sub close_in { print STDERR blurb() . "closing $in_url..." if ($verbose > 1); if ($in_open_p) { print STDERR "\n" if ($verbose > 1); } else { print STDERR " already closed.\n" if ($verbose > 1); return; } # close() requires the other side of the connection to ACK the close. # apparently we were hanging here. So instead, let's use shutdown(), # which doesn't wait for the ACK. # close (IN); shutdown (IN, 2); $in_open_p = 0; # print STDERR blurb() . "closed\n" if ($verbose > 1); } # Close the output file if it is open. # Exits if there is an error. # sub close_out { return unless $out_open_p; flock(OUT, LOCK_UN) || error ("unlocking $out_filename: $!\n"); close OUT || error ("closing $out_filename: $!\n"); $out_open_p = 0; my $elapsed = time - $start_time; my $written = int($total_bytes_written / $file_bytes_per_second); my $missing = $elapsed - $written; if ($verbose || $missing > 60) { print STDERR blurb() . "closed $out_filename: "; print STDERR sprintf("%d:%02d:%02d", int ($written / (60 * 60)), int (($written / 60) % 60), int ($written % 60)); print STDERR "\n"; } if ($missing > 60) { print STDERR blurb() . "WARNING: $out_filename is short!\n"; print STDERR blurb() . "WARNING:"; print STDERR sprintf("%d:%02d:%02d elapsed, ", int ($elapsed / (60 * 60)), int (($elapsed / 60) % 60), int ($elapsed % 60)); print STDERR sprintf("%d:%02d:%02d missing!\n", int ($missing / (60 * 60)), int (($missing / 60) % 60), int ($missing % 60)); } $total_bytes_written = 0; } # Returns a filename to use for the current date, # of the form "$out_dir/YYYY/MM-DD.mp3". # creates the "YYYY" directory if it doesn't exist. # sub make_filename { my ($time, $suffix) = @_; error "directory $out_dir does not exist" unless (-d $out_dir); my ($sec, $min, $hour, $dotm, $mon, $year) = localtime($time); $year += 1900; $mon++; my $dir = "$out_dir/$year"; if (! -d $dir) { mkdir ($dir, 0777) || error "mkdir $dir: $!"; print STDERR blurb() . "created directory $dir\n" if ($verbose); } my $prefix = sprintf ("$dir/%02d-%02d", $mon, $dotm); my $file = "$prefix.$suffix"; my $counter = 0; while (-f $file) { my $chr = chr ((++$counter) + ord('a')); $file = "$prefix$chr.$suffix"; } if ($counter > 0 && $suffix eq $audio_suffix) { my $f = $file; $f =~ s@^.*/([^/]*)$@$1@; print STDERR blurb() . "WARNING: $prefix.$suffix exists: using $f.\n"; } return $file; } # Opens a new output file whose name is based on the current time. # Leaves it in . # sub open_out { error ("output file $out_filename is already open!") if ($out_open_p); $out_filename = make_filename(time, $audio_suffix); if (-f $out_filename) { print STDERR blurb() . "WARNING: $out_filename exists: appending.\n"; } open (OUT, ">>$out_filename") || error ("opening $out_filename: $!\n"); flock(OUT, LOCK_EX) || error ("locking $out_filename: $!\n"); $out_open_p = 1; $total_bytes_written = 0; print STDERR blurb() . "writing $out_filename\n" if $verbose; # Write a zero-length file whose purpose is to hold the date at which # we *started* writing data (the write date of the file in OUT will be # the date we *stopped* writing data.) # my $time_file = $out_filename; $time_file =~ s@\.[^./]+$@.time@; if (! -f $time_file) { local *OUT2; open (OUT2, ">$time_file") || error ("opening $time_file: $!\n"); close OUT2; print STDERR blurb() . "touched $time_file\n" if ($verbose > 1); } } # Write the contents of $read_buffer to , exit if there is an error. # sub write_data { my $written; my $off = 0; my $len = $read_buffer_length; while ($len > 0) { $written = syswrite (OUT, $read_buffer, $len, $off); if (!defined($written) || $written < 0) { error ("writing $len bytes to $out_filename: $!\n"); } print STDERR blurb() . " wrote $written bytes of $len\n" if ($verbose > 3); $off += $written; $len -= $written; $total_bytes_written += $written; } print STDERR blurb() . " wrote $read_buffer_length bytes\n" if ($verbose == 3); $read_buffer_length = 0; } # Reads data from and writes it to , opening the file first # if necessary. # sub copy_data { await_input(); $read_buffer_length = sysread (IN, $read_buffer, 10240); check_alarm(); print STDERR blurb() . " read $read_buffer_length bytes\n" if ($verbose > 2); if (!defined($read_buffer_length) || $read_buffer_length <= 0) { $read_buffer_length = 0; print STDERR blurb() . "EOF on input URL\n" if ($verbose); close_in(); } else { write_data(); } return $read_buffer_length; } # Wait for there to be data available on IN. # Opens IN first if necessary. # Note that if $read_timeout seconds elapse in this function, then # error_alarm will go off and close IN, and this function will # re-open it and try again. # sub await_input { my $timeout = 5; while (1) { open_in() unless $in_open_p; my ($rin, $win, $ein, $rout, $wout, $eout); $rin = $win = $ein = ''; vec($rin,fileno(IN),1) = 1; $ein = $rin | $win; my ($nfound, $timeleft) = select($rout=$rin, $wout=$win, $eout=$ein, $timeout); check_alarm(); if ($verbose > 3 && $nfound <= 0) { print STDERR blurb() . " $timeout seconds with no input! Reconnecting...\n"; } elsif ($verbose > 4) { print STDERR blurb() . " input available\n"; } last if ($nfound > 0); } } # If the network stream stalls, this gets signalled, and the current # input stream is closed, and a new connection will be made. # We close it in check_alarm() instead of in this function because # signal handlers really don't like calling malloc(). # sub error_alarm { $alarming = 1; # This is dangerous -- malloc on the signal stack might blow up. # print STDERR "\n" . blurb() . "ALARM!\n\n" if ($verbose > 3); } sub check_alarm { return unless $alarming; $alarming = 0; print STDERR blurb() . "timeout: $read_timeout seconds with no data!\n"; close_in(); } # Opens a URL, and reads from it # Read from and write to . # Re-opens if there is a read error, or if bits stop arriving. # Exits if there is ever an error writing (e.g., a full disk.) # sub read_loop { open_out(); $start_time = time; $SIG{ALRM} = \&error_alarm; print STDERR blurb() . "installed alarm handler\n" if ($verbose > 3); my $got_data = 1; while (1) { if ($got_data) { alarm $read_timeout; # reset timeout before each read. print STDERR blurb() . "reset alarm for $read_timeout seconds\n" if ($verbose > 3); } $got_data = copy_data(); check_time(); } } # Tick the write-date on the output file. # Since we do comparisons of the dates of the .time file with the .mp3 file, # it's important that the .mp3 file have a date of the time at which we stopped # writing to it (even if, due to network lossage, we haven't actually written # any bytes to it in a while.) Otherwise, we don't notice that there are # missing bytes in the file at the *end*. # sub touch_file { error ("no out filename set?") unless (defined($out_filename)); local *OUT; open (OUT, ">>$out_filename") || error ("touching $out_filename: $!\n"); close OUT; } # If the current wall-clock time has passed our stop time, bail. # sub check_time { my $now = time; check_alarm(); if ($now >= $stop_time) { close_out(); touch_file(); exit(0); } } # parses "30 secs", "30s", "3 hours", "3h", etc and returns seconds. # sub parse_secs { my ($secs) = @_; if ($secs =~ m/^(\d+)$/) { $secs = 0 + $1; } elsif ($secs =~ m/^(\d+)s*(s|secs?|seconds?)$/i) { $secs = (0 + $1); } elsif ($secs =~ m/^(\d+)s*(m|mins?|minutes?)$/i) { $secs = (0 + $1) * 60; } elsif ($secs =~ m/^(\d+)s*(h|hrs?|hours?)$/i) { $secs = (0 + $1) * 60 * 60; } elsif ($secs =~ m/^(\d\d?):(\d\d):(\d\d)$/) { $secs = ($1 * 60 * 60) + ($2 * 60) + $3; } else { error "\"$secs\" unparsable: try 'HH:MM:SS'\n"; } return $secs; } sub write_desc_file { my ($desc) = @_; $desc =~ s@\\n@\n@gs; $desc =~ s@^([^\s*]+?\.html\n)@@si; # take off flyer info on first line my $flyer = $1; $desc =~ s@^([^a-z\n]+)$@$1@mg; # boldify all-caps lines $desc =~ s@\n@\n@sgi; # let caps span lines $desc =~ s@^\*(.+)$@$1@mg; # boldify "*" lines $desc =~ s@\n@\n
@gs; # convert newlines to
$desc =~ s@\n*
\n*$@@s; # delete trailing
s $desc .= "\n"; # end file with newline $desc = "$flyer$desc" if defined ($flyer); # put flyer back my $desc_file = make_filename (time, "desc"); if (-f $desc_file) { print STDERR blurb() . "WARNING: $desc_file exists: unchanged!\n"; return; } open (OUT, ">>$desc_file") || error ("opening $desc_file: $!\n"); print OUT $desc; close (OUT); } sub usage { print STDERR "usage: $progname " . "[--verbose] in-url out-directory total-duration\n"; exit 1; } sub main { $in_url = undef; $out_dir = undef; my $tdur = undef; my $desc = undef; while ($_ = $ARGV[0]) { shift @ARGV; if ($_ eq "--verbose") { $verbose++; } elsif (m/^-v+$/) { $verbose += length($_)-1; } elsif ($_ eq "--desc") { $desc = shift @ARGV; } elsif (m/^-./) { usage; } elsif (!defined($in_url)) { $in_url = $_; } elsif (!defined($out_dir)) { $out_dir = $_; } elsif (!defined($tdur)) { $tdur = $_; } else { usage; } } usage unless defined($in_url); usage unless defined($out_dir); usage unless defined($tdur); $out_dir = "." if ($out_dir eq ""); $out_dir =~ s@/+$@@; # lose trailing slash $tdur = parse_secs ($tdur); error "total-duration $tdur secs sucks, man" if ($tdur < 5); if ($verbose > 1) { my $h = int($tdur) / 60 / 60; my $m = (int($tdur) / 60) % 60; my $s = int($tdur) % 60; my $hms = sprintf ("%d:%02d:%02d", $h, $m, $s); print STDERR blurb() . "total duration: $tdur seconds ($hms)\n"; } $stop_time = time() + $tdur; write_desc_file ($desc) if ($desc); read_loop(); } main; exit 0;