/* -*- Mode: C; indent-tabs-mode: nil; tab-width: 8 -*-

   Copyright © 2001-2006 Jamie Zawinski

   Permission to use, copy, modify, distribute, and sell this software and
   its documentation for any purpose is hereby granted without fee, provided
   that the above copyright notice appear in all copies and that both that
   copyright notice and this permission notice appear in supporting
   documentation.  No representations are made about the suitability of this
   software for any purpose.  It is provided "as is" without express or
   implied warranty.

   Created: 19-Sep-2001.

   This program detects and strips silence from MP3 files.

   "Silence" is defined as the average volume having been below a low
   threshold (around 10%) for at least two seconds.

   Contiguous blocks of silence that are longer than 60 seconds are deleted.
   Ten seconds of silence are left on either side of deleted blocks, to avoid
   clipping off subtle fades (out or in.)

   As a side-effect, all invalid or unparsable frames are deleted: what
   remains in the output file will be a syntactically correct MP3 file.

   Usage: silencer infile.mp3 --strip outfile.mp3

   For debugging purposes, this program can also generate a PBM file that
   graphically displays a histogram of volume and average volume of the input
   MP3 file; also marked in this image are the sections of the input file
   that have been (or would have been) deleted.

   I wrote this program because the time ranges covered by the DNA Lounge
   audio archives are driven by the hours of operation listed on the
   calendar; so if an event starts late, or if it ends early, then silence
   slips into the files.  This wouldn't be a big deal if people were
   downloading these as MP3 files, but since they are streamed, and there's
   no way to fast forward or rewind, it's a pain to have to wait through half
   an hour of silence before the music starts!

   This program requires libmad (MPEG Audio Decoder library) which you can
   find at http://www.mars.org/home/rob/proj/mpeg/

   Verbosity levels:

0: print errors only.
1: print files written. 2: print info clipped silent runs as they occur. 3: print more info about data read. print percent done every few seconds. 4: print all mp3 frame errors. print files read. print frames written. 5: print frame buffer activity. 6: print *all* frame buffering activity -- very loud. 7: print every mp3 header -- incredibly loud. 8: print info on every read() call -- deafening. */ #include #include #include #include #include #include #include #include #include #include #include #include const char *cvs_id = "$Revision: 1.22 $"; char *progname; static char *blurb; #define SILENCE_VOLUME 28 /* on a scale of 0-255: what volume counts as silent. */ /* 22-May-2006: 24 -> 28 for Mac */ #define SILENCE_QUANTUM 2 /* seconds: size of the sliding window over which volume is averaged. This is our smallest-chunk-of-time unit. */ #define SILENCE_SECONDS 60 /* seconds: how long a run of silence must continue before we delete it. */ #define SILENCE_SLACK 10 /* seconds: when we delete a run of silence, how much of a buffer should we leave behind on either side. */ #define EARLY_SECONDS (60*30) /* What counts as "early" in the file. When we strip out silence that appears "early" in the file, we pull the file's creation time forward, instead of pulling its "close" time backward. 
*/ #define READ_BUFFER_SIZE 409600 #define ALARM_PERIOD 10 static int timer_tick = 0; typedef struct mp3_data mp3_data; typedef struct averager averager; typedef struct silence_data silence_data; typedef struct frame_buffer frame_buffer; struct silence_data { int silence_volume; /* low pass filter */ int silence_quantum; /* range over which silence is checked */ int silence_seconds; /* how long below `volume' until we signal it */ int silence_slack; /* how much to leave behind, when deleting */ int early_seconds; /* how long counts as "early" */ int silence_frames; /* silence_seconds expressed as frames */ int silence_quantum_frames; /* range, in frames */ int silence_slack_frames; /* slack, in frames */ int early_frames; /* early, in frames */ int silence_start_frame; int silence_start_byte; int silence_slack_frame; int silence_slack_byte; int triggered_p; averager *avg; }; struct frame_buffer { unsigned char *buf; int buf_size; /* in bytes */ int max_frames; /* max frames the buffer can hold */ int nframes; /* number of frames currently in buffer */ int last_frame; /* number of the last frame in the buffer */ unsigned short *frame_lengths; }; #define HIST_UNKNOWN 0 /* values used in the shistogram table */ #define HIST_LOUD_WRITTEN 1 #define HIST_SILENT_DELETED 2 #define HIST_SILENT_WRITTEN 3 #define HIST_SILENT_BUFFERED 4 static const char *shistogram_desc[] = { /* for error messages */ "unknown", "loud written", "silent deleted", "silent written", "silent buffered" }; struct mp3_data { int verbose_p; struct mad_stream *stream; unsigned char *buf; /* input file data buffer */ int buf_size; int buf_fp; int fd; /* input file */ int file_size; /* input file size in bytes */ int frame_size; /* approximate size of frames in this file */ int file_frames; /* assumed input file size in frames */ int bytes_read; int seconds_per_frame; /* duration of frames in this file */ int fractions_per_frame; int frame_count; /* frame number currently being processed */ int 
elapsed_seconds; /* elapsed time at current frame */ int elapsed_fraction; int bad_frame_count; /* how many corrupted frames we read */ int max_bad_frame_count; /* how many to tolerate */ int eof_p; /* whether we've read end-of-file */ /* > 0 -- normal eof; < 0 -- error eof. */ unsigned char *vhistogram; /* volume of each frame */ unsigned char *ahistogram; /* average of previous frames */ unsigned char *shistogram; /* which frames are marked for deletion */ int histogram_size; const char *mp3_outfile; /* file to which we're writing stripped data */ int mp3_out; int frames_written; /* frames currently in the output file */ int bytes_written; /* bytes currently in the output file */ int frames_omitted; /* How many total frames we discarded */ int frames_omitted_early; /* How many frames we discarded from near the beginning of the file */ silence_data silence_data; frame_buffer frame_buffer; }; static void first_frame_initialization (mp3_data *, struct mad_header const *); static void process_silence (mp3_data *); static void write_frame (mp3_data *, int loud_p); static void buffer_frame (mp3_data *); static void flush_frames (mp3_data *, int write_p); static void truncate_file (mp3_data *data, int byte, int frame); static void shistogram_store (mp3_data *data, const char *whence, int frame, int expect, int new); static void count_omitted (mp3_data *data, int omitted_frames); static void adjust_for_early_silence (const char *filename, mp3_data *data); /************************************************************************** + Keeping track of a running average of sequence of numbers * (so we can tell when it's been quiet for N seconds.) 
**************************************************************************/ struct averager { unsigned char *stack; int stack_size; int stack_fp; int total; }; static averager * make_averager (int size) { averager *a = (averager *) calloc (1, sizeof(*a)); a->stack_size = size; a->stack = (unsigned char *) calloc (1, a->stack_size); return a; } int averager_value (averager *a) { return a->total / a->stack_size; } int averager_push (averager *a, unsigned char value) { int pop = a->stack_fp + 1; int push = a->stack_fp; int old; if (pop >= a->stack_size) pop = 0; old = a->stack[pop]; a->stack[pop] = 0; a->stack[push] = value; a->stack_fp++; if (a->stack_fp >= a->stack_size) a->stack_fp = 0; a->total += (value - old); return a->total / a->stack_size; } /************************************************************************** + Callbacks for iterating over the MPEG frames **************************************************************************/ /* Callback that reads data and feeds it into MAD. */ static enum mad_flow input_cb (void *closure, struct mad_stream *stream) { mp3_data *data = (mp3_data *) closure; int len; if (data->eof_p > 0) return MAD_FLOW_STOP; /* stop decoding (no error) */ else if (data->eof_p < 0) return MAD_FLOW_BREAK; /* stop decoding (with an error) */ data->stream = stream; if (stream->next_frame) { /* next_frame points to the index in buf which has been consumed: it is the end of the frame we just processed. So copy that data down to the beginning of the buffer, so we can add more after it. 
*/ data->buf_fp = (data->buf + data->buf_fp) - stream->next_frame; memmove (data->buf, stream->next_frame, data->buf_fp); } if (timer_tick && data->verbose_p > 2) { timer_tick = 0; fprintf (stdout, "%s: (processed %d:%02d:%02d -- %d%%)\n", blurb, data->elapsed_seconds / (60 * 60), (data->elapsed_seconds / 60) % 60, data->elapsed_seconds % 60, data->bytes_read / (data->file_size / 100)); fflush (stdout); alarm (ALARM_PERIOD); } do { len = read (data->fd, data->buf + data->buf_fp, data->buf_size - data->buf_fp); } while (len == -1 && errno == EINTR); if (len == -1) { return MAD_FLOW_BREAK; /* stop decoding (with an error) */ } data->bytes_read += len; if (data->verbose_p > 7) fprintf (stderr, "%s: read %d bytes\n", blurb, len); if (data->bytes_read > data->file_size) { fprintf (stderr, "%s: WARNING: file grew while we were reading! Bailing!\n", blurb); data->eof_p = -1; return MAD_FLOW_BREAK; /* stop decoding (with an error) */ } if (len > 0) { mad_stream_buffer (stream, data->buf, data->buf_fp += len); return MAD_FLOW_CONTINUE; /* continue decoding normally */ } else if (len < 0) { data->eof_p = -1; return MAD_FLOW_BREAK; /* stop decoding (with an error) */ } else { if (data->eof_p == 0) data->eof_p = 1; return MAD_FLOW_STOP; /* stop decoding (no error) */ } } /* Called for each MPEG frame: examines the parsed frame header. 
*/
/* Details: advances the frame counter and the elapsed-time clock, optionally
   prints the header, and rejects frames whose size differs from the size of
   the first frame by more than 2 bytes.

   Returns MAD_FLOW_CONTINUE to have the frame decoded; MAD_FLOW_IGNORE to
   skip a frame whose size is suspicious; MAD_FLOW_STOP when far more frames
   have been seen than the file was expected to contain; MAD_FLOW_BREAK when
   too many bad frames have been seen.
 */
static enum mad_flow
header_cb (void *closure, struct mad_header const *header)
{
  mp3_data *data = (mp3_data *) closure;
  int bailing_p = 0;

  data->frame_count++;

  if (data->frame_count > data->file_frames + 100)
    bailing_p = 1;

  data->elapsed_seconds  += header->duration.seconds;
  data->elapsed_fraction += header->duration.fraction;

  /* Carry whole seconds out of the fraction.  This must be >=, not >:
     a fraction exactly equal to MAD_TIMER_RESOLUTION is one full second.
   */
  if (data->elapsed_fraction >= MAD_TIMER_RESOLUTION)
    {
      data->elapsed_seconds += data->elapsed_fraction / MAD_TIMER_RESOLUTION;
      data->elapsed_fraction %= MAD_TIMER_RESOLUTION;
    }

  /* NOTE(review): the first-frame header is printed only when verbose_p is
     exactly 3 (or > 6), not at levels 4 through 6 -- confirm that is
     intended. */
  if (data->verbose_p > 6 ||
      (data->verbose_p == 3 && data->frame_size == 0))
    {
      fprintf (stderr, "%s: ", blurb);

      if (data->verbose_p > 6)
        fprintf (stderr, "frame %4d: %d:%02d:%02d: ",
                 data->frame_count,
                 data->elapsed_seconds / (60 * 60),
                 (data->elapsed_seconds / 60) % 60,
                 data->elapsed_seconds % 60);

      switch (header->layer)
        {
        case MAD_LAYER_I:   fprintf (stderr, "Layer I");       break;
        case MAD_LAYER_II:  fprintf (stderr, "Layer II");      break;
        case MAD_LAYER_III: fprintf (stderr, "Layer III");     break;
        default:            fprintf (stderr, "Unknown Layer"); break;
        }

      fprintf (stderr, " ");

      switch (header->mode)
        {
        case MAD_MODE_SINGLE_CHANNEL:
          fprintf (stderr, "1ch");
          break;
        case MAD_MODE_DUAL_CHANNEL:
          fprintf (stderr, "2ch");
          break;
        case MAD_MODE_JOINT_STEREO:
          fprintf (stderr, "joint stereo");
          break;
        case MAD_MODE_STEREO:
          fprintf (stderr, "normal stereo");
          break;
        default:
          fprintf (stderr, "unknown mode");
          break;
        }

      fprintf (stderr, "; ");

      switch (header->emphasis)
        {
        case MAD_EMPHASIS_NONE:
          break;
        case MAD_EMPHASIS_50_15_US:
          fprintf (stderr, " 50/15 emp; ");
          break;
        case MAD_EMPHASIS_CCITT_J_17:
          fprintf (stderr, "J.17 emp; ");
          break;
        default:
          fprintf (stderr, " unknown emphasis; ");
          break;
        }

      fprintf (stderr, "%dK %.1fKHz",
               (int)(header->bitrate/1000),
               (header->samplerate/1000.0));
      fprintf (stderr, "\n");
    }

  if (bailing_p)
    {
      /* (read * 100) / size in a wide type: dividing by (size / 100) would
         divide by zero for files smaller than 100 bytes. */
      int pct_done = (data->file_size > 0
                      ? (int) (((long long) data->bytes_read * 100) /
                               data->file_size)
                      : 0);
      int unread_bytes  = data->file_size - data->bytes_read;
      int unread_frames = data->file_frames - data->frame_count;

      /* Since file_frames is only a guess, this warning is often
         spurious. */
      if (unread_bytes != 0)
        {
          fprintf (stderr, "%s: WARNING: expected only %d frames in file!\n",
                   blurb, data->file_frames);
          fprintf (stderr,
                   "%s: WARNING: stop at %d%%; %d bytes %d frames left.\n",
                   blurb, pct_done, unread_bytes, unread_frames);
        }

      if (data->eof_p == 0)
        data->eof_p = 1;
      return MAD_FLOW_STOP;     /* stop decoding (no error) */
    }

  if (data->frame_size == 0)
    first_frame_initialization (data, header);
  else
    {
      int frame_size = data->stream->next_frame - data->stream->this_frame;

      if (frame_size < (data->frame_size - 2) ||
          frame_size > (data->frame_size + 2))
        {
          /* This is not a bad frame according to the MPEG spec, but it is
             a bad frame for us, because the frame size has changed wildly.
             Odds are this is a corrupted frame that just happens to be
             legal.  (Or a VBR file, which is not supported.)
           */
          data->bad_frame_count++;
          count_omitted (data, 1);

          bailing_p = (data->bad_frame_count > data->max_bad_frame_count);

          if (bailing_p || data->verbose_p > 3)
            {
              fprintf (stderr, "%s: frame %4d: %d:%02d:%02d: ",
                       blurb,
                       data->frame_count,
                       data->elapsed_seconds / (60 * 60),
                       (data->elapsed_seconds / 60) % 60,
                       data->elapsed_seconds % 60);
              fprintf (stderr, "large frame size change: %d to %d!\n",
                       data->frame_size, frame_size);
            }

          if (bailing_p)
            {
              fprintf (stderr, "%s: WARNING: %d+ bad frames! Bailing!\n",
                       blurb, data->bad_frame_count);
              data->eof_p = -1;
              return MAD_FLOW_BREAK;  /* stop decoding (with an error) */
            }

          if (data->frame_count < 10)
            {
              /* Still near the start: assume the earlier frames were the
                 odd ones, and re-derive everything from this frame. */
              if (data->verbose_p > 3)
                fprintf (stderr, "%s: discarding frames read so far.\n",
                         blurb);
              first_frame_initialization (data, header);
            }
          else
            {
              /* Too far into the stream.  Discard this frame. */
              return MAD_FLOW_IGNORE;  /* skip decoding of this frame */
            }
        }
    }

  return MAD_FLOW_CONTINUE;     /* continue decoding normally */
}


/* Called to examine or modify the decoded frame subband samples.
*/ static enum mad_flow filter_cb (void *closure, struct mad_stream const *stream, struct mad_frame *frame) { mp3_data *data = (mp3_data *) closure; /* After the frame has been decoded, the subband samples are in struct mad_frame, as frame.sbsample[channel][time][subband]. There are 32 subbands covering the entire frequency spectrum (think frequency domain). Each frame contains up to 36 time slices (think time domain) -- use MAD_NSBSAMPLES(&frame.header) to get the actual number in the current frame. Finally, there are potentially 2 channels -- use MAD_NCHANNELS(&frame.header) to be certain. During synthesis, each time slice generates 32 PCM samples, so generally 36 * 32 == 1152 PCM samples per frame. Since we're only looking for a measure of how silent each frame is, we can do this as easily in the frequency domain as in the time domain, so we can avoid the expensive synthesis step. Sample format: these are mad_fixed_t, which is MAD's fixed-point format; details in libmad/fixed.h. Full scale (0 dB) would be +/- 1.0, so generally samples fall between -1.0 and +1.0 (also known as -MAD_F_ONE and MAD_F_ONE.) */ int channel, slice, band; unsigned long avg = 0; int denom; for (channel = 0; channel < MAD_NCHANNELS(&frame->header); channel++) for (slice = 0; slice < MAD_NSBSAMPLES(&frame->header); slice++) for (band = 0; band < 32; band++) { mad_fixed_t fixed = frame->sbsample[channel][slice][band]; /* Fixed point values cover the range [-8.0, 8.0). Convert this to the int range [-1024, 1023] meaning that [-1.0, 1.0) ==> [-128, 127]. 
*/ int sample = fixed >> (MAD_F_FRACBITS - 7); if (sample < 0) sample = -sample; /* abs */ avg += sample; } denom = (MAD_NCHANNELS(&frame->header) * /* number of samples in frame */ MAD_NSBSAMPLES(&frame->header) * 32); denom >>= 6; /* imperical fudge factor */ avg /= denom; if (avg > 255) avg = 255; if (data->vhistogram) data->vhistogram[data->frame_count] = (unsigned char) avg; /* Examine the histogram to decide if we're in a silent portion. */ process_silence (data); /* This means "don't do PCM synthesis", which is the expensive part. */ return MAD_FLOW_IGNORE; /* skip decoding the rest of the current frame */ } /* Called to do something with the synthesized output PCM samples, if the filter callback said that we are doing PCM synthesis. */ static enum mad_flow output_cb (void *closure, struct mad_header const *header, struct mad_pcm *pcm) { /* This is never called, since we aren't doing PCM synthesis. */ abort(); return MAD_FLOW_BREAK; } /* Called to handle decoder errors. */ static enum mad_flow error_cb (void *closure, struct mad_stream *ms, struct mad_frame *frame) { mp3_data *data = (mp3_data *) closure; const char *err; int bailing_p = 0; data->bad_frame_count++; count_omitted (data, 1); if (data->bad_frame_count > data->max_bad_frame_count) bailing_p = 1; if (bailing_p || data->verbose_p > 3) fprintf (stderr, "%s: frame %4d: %d:%02d:%02d: ", blurb, data->frame_count, data->elapsed_seconds / (60 * 60), (data->elapsed_seconds / 60) % 60, data->elapsed_seconds % 60); if (bailing_p) { fprintf (stderr, "%s: WARNING: %d+ bad frames! 
Bailing!\n", blurb, data->bad_frame_count-1); data->eof_p = -1; return MAD_FLOW_BREAK; /* stop decoding (with an error) */ } switch (ms->error) { case MAD_ERROR_BUFLEN: err = "input buffer too small (or EOF)"; break; case MAD_ERROR_BUFPTR: err = "invalid (null) buffer pointer"; break; case MAD_ERROR_NOMEM: err = "not enough memory"; break; case MAD_ERROR_LOSTSYNC: err = "lost synchronization"; break; case MAD_ERROR_BADLAYER: err = "reserved header layer value"; break; case MAD_ERROR_BADBITRATE: err = "forbidden bitrate value"; break; case MAD_ERROR_BADSAMPLERATE: err = "reserved frequency value"; break; case MAD_ERROR_BADEMPHASIS: err = "reserved emphasis value"; break; case MAD_ERROR_BADCRC: err = "CRC check failed"; break; case MAD_ERROR_BADBITALLOC: err = "forbidden bit allocation value"; break; case MAD_ERROR_BADSCALEFACTOR: err = "bad scalefactor index"; break; case MAD_ERROR_BADFRAMELEN: err = "bad frame length"; break; case MAD_ERROR_BADBIGVALUES: err = "bad big_values count"; break; case MAD_ERROR_BADBLOCKTYPE: err = "reserved block_type"; break; case MAD_ERROR_BADSCFSI: err = "bad scalefactor selection info"; break; case MAD_ERROR_BADDATAPTR: err = "bad main_data_begin pointer"; break; case MAD_ERROR_BADPART3LEN: err = "bad audio data length"; break; case MAD_ERROR_BADHUFFTABLE: err = "bad Huffman table select"; break; case MAD_ERROR_BADHUFFDATA: err = "Huffman data overrun"; break; case MAD_ERROR_BADSTEREO: err = "incompatible block_type for JS"; break; default: err = "UNKNOWN ERROR"; break; } switch (ms->error) { case MAD_ERROR_LOSTSYNC: if (!strncmp((const char *) ms->this_frame, "ID3", 3)) { if (data->verbose_p > 4) fprintf (stderr, "skipping ID3 frame\n"); return MAD_FLOW_IGNORE; /* skip the rest of the current frame */ } else if (strncmp((const char *) ms->this_frame, "TAG", 3) == 0) { if (data->verbose_p > 4) fprintf (stderr, "skipping ID3v1 frame\n"); mad_stream_skip (ms, 128); return MAD_FLOW_CONTINUE; /* continue decoding normally */ } /* else 
fall through */ case MAD_ERROR_BADCRC: if (data->verbose_p > 3) fprintf (stderr, "error: %s\n", err); return MAD_FLOW_IGNORE; /* skip the rest of the current frame */ default: if (data->verbose_p > 3) fprintf (stderr, "error: %s\n", err); return MAD_FLOW_CONTINUE; /* continue decoding normally */ } } static void first_frame_initialization (mp3_data *data, struct mad_header const *header) { silence_data *sd = &data->silence_data; frame_buffer *fb = &data->frame_buffer; float secs_per_frame; /* Guess that all frames in the file are about the size of the first one. But they DO vary! 128K files have either 417 or 418 byte frames -- frame length = 144*bitrate / sample rate + (padding?1:0)... */ data->frame_size = data->stream->next_frame - data->stream->this_frame; data->seconds_per_frame = header->duration.seconds; data->fractions_per_frame = header->duration.fraction; data->file_frames = data->file_size / data->frame_size; data->histogram_size = data->file_frames + 1; /* Round up to a multiple of 8 (for PBM-generation niceness.) 
*/ data->histogram_size = ((data->histogram_size + 8) / 8) * 8; if (data->vhistogram) free (data->vhistogram); if (data->ahistogram) free (data->ahistogram); if (data->shistogram) free (data->shistogram); data->vhistogram = (unsigned char *) calloc (1, data->histogram_size + 200); data->ahistogram = (unsigned char *) calloc (1, data->histogram_size + 200); data->shistogram = (unsigned char *) calloc (1, data->histogram_size + 200); secs_per_frame = (data->seconds_per_frame + (data->fractions_per_frame / (float)MAD_TIMER_RESOLUTION)); /* Initialize the silence detector */ sd->silence_volume = SILENCE_VOLUME; sd->silence_seconds = SILENCE_SECONDS; sd->silence_quantum = SILENCE_QUANTUM; sd->silence_slack = SILENCE_SLACK; sd->early_seconds = EARLY_SECONDS; sd->silence_frames = sd->silence_seconds / secs_per_frame; sd->silence_quantum_frames = sd->silence_quantum / secs_per_frame; sd->silence_slack_frames = sd->silence_slack / secs_per_frame; sd->early_frames = sd->early_seconds / secs_per_frame; sd->silence_start_frame = -1; sd->silence_start_byte = -1; sd->silence_slack_frame = -1; sd->silence_slack_byte = -1; sd->triggered_p = 0; sd->avg = make_averager (sd->silence_quantum_frames); if (data->verbose_p > 2) { fprintf (stderr, "%s: searching for %d+ sec silence (%d sec slack)\n", blurb, sd->silence_seconds, sd->silence_slack); } if (sd->silence_seconds == 0 || sd->silence_seconds < ((sd->silence_quantum * 2) + 1) || sd->silence_seconds < ((sd->silence_slack * 2) + 1) || sd->silence_slack < sd->silence_quantum) { fprintf (stderr, "%s: silence params out of bounds: sec: %d; quantum: %d; slack %d\n", blurb, sd->silence_seconds, sd->silence_quantum, sd->silence_slack); exit (2); } /* Initialize the MP3 file writer (the frame buffer.) 
*/ fb->max_frames = sd->silence_slack_frames; fb->buf_size = ((fb->max_frames + 2) * (data->frame_size + 4)); if (fb->buf) free (fb->buf); fb->buf = (unsigned char *) calloc (1, fb->buf_size); if (fb->frame_lengths) free (fb->frame_lengths); fb->frame_lengths = (unsigned short *) calloc (sizeof(*fb->frame_lengths), (fb->max_frames + 2)); if (!fb->buf || !fb->frame_lengths) { fprintf (stderr, "%s: %s: out of memory (allocating %d output buffer)\n", progname, data->mp3_outfile, fb->buf_size); exit (1); } } /************************************************************************** + Detection of silence, and output. **************************************************************************/ static void process_silence (mp3_data *data) { silence_data *sd = &data->silence_data; frame_buffer *fb = &data->frame_buffer; int i = data->frame_count; int b = data->bytes_written; int volume = (data->eof_p ? 0 : data->vhistogram[i]); int silent_p; int running_average; if (i < 0) abort(); if (data->shistogram[i] != HIST_UNKNOWN && data->verbose_p > 4) fprintf (stderr, "%s: new frame %d already marked? (%d)\n", blurb, i, data->shistogram[i]); running_average = averager_push (sd->avg, volume); silent_p = (running_average <= sd->silence_volume); if (data->eof_p) /* close off a trailing silent block at EOF */ silent_p = 0; /* to graph the running average instead of the volume in the PBM */ data->ahistogram[i] = running_average; if (silent_p) { /* We have entered (or are already in) a block of silence: the average of the previous N blocks is below the volume threshold. (But this block might not qualify as being long enough just yet.) */ if (!sd->triggered_p && (sd->silence_start_frame == -1 || i < sd->silence_start_frame + sd->silence_frames)) { /* We're in a silent passage, but it is not yet long enough to qualify qualify for being stripped. Frames like this get written. 
*/ write_frame (data, 0); if (sd->silence_start_frame == -1) { /* This is the first frame in a consecutive run of 1 or more silent frames: remember it. */ sd->silence_start_frame = i; sd->silence_start_byte = b; /* Kludge for silence at beginning-of-file -- if silence started at the beginning of the file, then set slack there as well, because we know there was no fade-out before this block of silence (the file begins silent.) */ if (sd->silence_start_byte == 0) { sd->silence_slack_frame = sd->silence_start_frame; sd->silence_slack_byte = sd->silence_start_byte; } } /* If we've just passed the leading slack threshold, remember it. */ if (sd->silence_slack_frame == -1 && i >= sd->silence_start_frame + sd->silence_slack_frames) { sd->silence_slack_frame = i; sd->silence_slack_byte = b; } } else if (!sd->triggered_p) { /* We're in a silent passage, and it just became long enough to qualify for being stripped. We want to strip it all the way back to where it began being silent, plus some slack. (We don't strip to *this* point: this point is N minutes into the silence.) */ int f; sd->triggered_p = i; truncate_file (data, sd->silence_slack_byte, (data->frames_written - (i - sd->silence_slack_frame))); /* Mark the frames we just threw away as having been silent. */ for (f = sd->silence_slack_frame; f <= i; f++) shistogram_store (data, "trunc", f, (f == i ? HIST_UNKNOWN : HIST_SILENT_WRITTEN), HIST_SILENT_DELETED); if (sd->silence_start_frame == -1) abort(); } else { /* We're in the midst of a long silent passage. Instead of writing these frames, buffer them. We'll need the last few frames in the buffer when we reach the end of the silence. */ buffer_frame (data); if (sd->silence_start_frame == -1) abort(); } } else { /* We are not in a block of silence: the average of the previous N blocks is above the volume threshold. */ if (sd->triggered_p) { /* We had been in a long block of silence, so this is the end of it. 
The frame buffer contains the last `slack' frames of silence. We write those to the file, because although they average to below the threshold, they might contain a fade-in. Except, don't write them if we're at EOF now, since we know that no fade-in is coming. */ if (data->eof_p) flush_frames (data, 0); if (data->verbose_p > 1) { float secs_per_frame = (data->seconds_per_frame + (data->fractions_per_frame / (float)MAD_TIMER_RESOLUTION)); int start0 = sd->silence_start_frame; int end0 = i; int start1 = sd->silence_slack_frame; int end1 = end0 - fb->nframes; int start0_secs, end0_secs, dur0_secs; int start1_secs, end1_secs, dur1_secs; if (end0 < start0 || end0 < start1 || end1 < start0 || end1 < start1 || start1 < start0 || end0 < end1) abort(); start0_secs = secs_per_frame * start0; end0_secs = secs_per_frame * end0; dur0_secs = end0_secs - start0_secs; start1_secs = secs_per_frame * start1; end1_secs = secs_per_frame * end1; dur1_secs = end1_secs - start1_secs; fprintf (stderr, "\n" "%s: silence: " "%d:%02d:%02d - %d:%02d:%02d (%d:%02d:%02d) " "%5d-%d (%d)\n", blurb, start0_secs / (60 * 60), (start0_secs / 60) % 60, start0_secs % 60, end0_secs / (60 * 60), (end0_secs / 60) % 60, end0_secs % 60, dur0_secs / (60 * 60), (dur0_secs / 60) % 60, dur0_secs % 60, start0, end0, end0-start0); fprintf (stderr, "%s: clipped: " "%d:%02d:%02d - %d:%02d:%02d (%d:%02d:%02d) " "%5d-%d (%d)\n" "\n", blurb, start1_secs / (60 * 60), (start1_secs / 60) % 60, start1_secs % 60, end1_secs / (60 * 60), (end1_secs / 60) % 60, end1_secs % 60, dur1_secs / (60 * 60), (dur1_secs / 60) % 60, dur1_secs % 60, start1, end1, end1-start1); } } /* this frame isn't silent, so write it out, along with any silent-but-not-long-enough frames that are in the buffer. This includes any fade-in that might follow a long silence. 
*/ write_frame (data, 1); sd->silence_start_frame = -1; sd->silence_start_byte = -1; sd->silence_slack_frame = -1; sd->silence_slack_byte = -1; sd->triggered_p = 0; } if (data->shistogram[i] == HIST_UNKNOWN && data->verbose_p > 4) fprintf (stderr, "%s: new frame %d wasn't marked? (%d)\n", blurb, i, data->shistogram[i]); } static void write_all (int fd, const unsigned char *buf, size_t count) { while (count > 0) { int n = write (fd, buf, count); if (n < 0) { char buf2[1024]; if (errno == EINTR || errno == EAGAIN) continue; sprintf (buf2, "%.255s: write:", blurb); perror (buf2); exit (1); } count -= n; buf += n; } } /* Write the current frame to the output file (after also writing out any buffered frames.) */ static void write_frame (mp3_data *data, int loud_p) { flush_frames (data, 1); if (data->mp3_out && !data->eof_p) { /* if eof_p, data->stream has already been freed... I think... */ const unsigned char *mbuf = data->stream->this_frame; int length = data->stream->next_frame - mbuf; write_all (data->mp3_out, mbuf, length); data->bytes_written += length; } /* Mark this frame as having been written. */ shistogram_store (data, "write_frame", data->frame_count, HIST_UNKNOWN, (loud_p ? 
HIST_LOUD_WRITTEN : HIST_SILENT_WRITTEN)); data->frames_written++; } static void truncate_file (mp3_data *data, int byte, int frame) { if (!data->mp3_out) return; if (ftruncate (data->mp3_out, byte) || lseek (data->mp3_out, byte, SEEK_SET) < 0) { char buf[255]; sprintf(buf, "%.100s: %.100s", progname, data->mp3_outfile); perror (buf); exit (1); } if (data->verbose_p > 4) fprintf (stderr, "%s: %s: truncate to %d (from %d: delete %d)\n", progname, data->mp3_outfile, byte, data->bytes_written, data->bytes_written - byte); count_omitted (data, data->frames_written - frame); data->bytes_written = byte; data->frames_written = frame; } static void count_omitted (mp3_data *data, int omitted_frames) { silence_data *sd = &data->silence_data; data->frames_omitted += omitted_frames; if (data->frames_written <= sd->early_frames) data->frames_omitted_early += omitted_frames; } /* Copy a frame into the output buffer (so we can decide later whether to write it.) This is a circular buffer: only up to 2 * slack frames are preserved in the buffer; others expire out as new ones are added. */ static void buffer_frame (mp3_data *data) { frame_buffer *fb = &data->frame_buffer; const unsigned char *frame = data->stream->this_frame; int length = data->stream->next_frame - frame; int end = 0; int i; for (i = 0; i < fb->nframes; i++) end += fb->frame_lengths[i]; count_omitted (data, 1); if (end >= fb->buf_size) abort(); if (fb->nframes > fb->max_frames) abort(); if (fb->nframes == fb->max_frames) { /* The buffer is full. Remove the oldest frame from the buffer to make room for the new one. 
*/ int last_in_buffer = fb->last_frame; int first_in_buffer = last_in_buffer - (fb->nframes - 1); shistogram_store (data, "buffer_frame 1", first_in_buffer, HIST_SILENT_BUFFERED, HIST_SILENT_DELETED); end -= fb->frame_lengths[0]; if (end <= 0) abort(); memmove (fb->buf, fb->buf + fb->frame_lengths[0], end); memmove (fb->frame_lengths, fb->frame_lengths + 1, (fb->nframes - 1) * sizeof(*fb->frame_lengths)); fb->nframes--; if (fb->nframes < 0) abort(); } /* Add the new frame to the end. */ if (end + length >= fb->buf_size) abort(); memmove (fb->buf + end, frame, length); fb->frame_lengths[fb->nframes] = length; fb->nframes++; fb->last_frame = data->frame_count; shistogram_store (data, "buffer_frame 2", data->frame_count, HIST_UNKNOWN, HIST_SILENT_BUFFERED); if (data->verbose_p > 5) fprintf (stderr, "%s: buffer frame %d (#%d, L=%d)\n", blurb, fb->last_frame, fb->nframes, length); } /* Empty the output buffer, either by writing the frames to the output file, or by discarding them. */ static void flush_frames (mp3_data *data, int write_p) { frame_buffer *fb = &data->frame_buffer; int last_in_buffer = fb->last_frame; int first_in_buffer = last_in_buffer - (fb->nframes - 1); int i; if (write_p && data->mp3_out && fb->nframes) { int end = 0; for (i = 0; i < fb->nframes; i++) end += fb->frame_lengths[i]; count_omitted (data, -fb->nframes); write_all (data->mp3_out, fb->buf, end); data->frames_written += fb->nframes; data->bytes_written += end; if (data->verbose_p > 4) fprintf (stderr, "%s: wrote frames %d - %d (%d)\n", blurb, first_in_buffer, last_in_buffer, fb->nframes); } else if (data->verbose_p > 4 && fb->nframes) fprintf (stderr, "%s: discarded buffer -- frames %d - %d (%d)\n", blurb, first_in_buffer, last_in_buffer, fb->nframes); /* Mark all these frames as having been written, or tossed. */ for (i = first_in_buffer; i <= last_in_buffer; i++) shistogram_store (data, "flush-frames", i, (write_p ? HIST_SILENT_BUFFERED : HIST_SILENT_DELETED), (write_p ? 
HIST_SILENT_WRITTEN : HIST_SILENT_DELETED)); /* Now reset the buffer. */ fb->nframes = 0; fb->last_frame = -1; } static void shistogram_store (mp3_data *data, const char *whence, int frame, int expect, int new) { int old = data->shistogram[frame]; if (old != expect && data->verbose_p > 4) { fprintf (stderr, "%s: %s: %s frame %d already %s?\n", blurb, whence, shistogram_desc[new], frame, shistogram_desc[old]); } data->shistogram[frame] = new; } /* Write an output PBM volume histogram file. */ static void generate_pbm (mp3_data *data, const char *filename) { FILE *f; int pbm_scale = 8; int width = data->histogram_size / pbm_scale; int height = 256; int x8, w8; unsigned char *pbm_data; int last_hh = 0; int last_mm = 0; int last_mm2 = 0; int last_ss5 = 0; float secs = 0; float secs_per_frame = (data->seconds_per_frame + (data->fractions_per_frame / (float)MAD_TIMER_RESOLUTION)); /* Round up to a multiple of 8, for PBM. */ width = ((width + 8) / 8) * 8; w8 = width >> 3; f = fopen (filename, "w"); if (!f) { char buf[255]; sprintf(buf, "%.100s: %.100s", progname, filename); perror (buf); exit (1); } pbm_data = (unsigned char *) calloc (w8, height); if (!pbm_data) { fprintf (stderr, "%s: %s: out of memory (%d x %d PBM)\n", progname, filename, width, height); exit (1); } for (x8 = 0; x8 < w8; x8++) { int x; for (x = 0; x < 8; x++) { int i, y; unsigned long vol = 0; unsigned long avg = 0; int hh, mm, mm2, ss5; int tick; int bar_bot = 128; int bar_top = 160; int state = HIST_LOUD_WRITTEN; int color; for (i = 0; i < pbm_scale; i++) { int ii = (x8 * 8 * pbm_scale) + (x + i); vol += data->vhistogram[ii]; avg += data->ahistogram[ii]; secs += secs_per_frame; state = (state == HIST_LOUD_WRITTEN ? data->shistogram[ii] : state); } vol /= pbm_scale; avg /= pbm_scale; /* Note when the clock has ticked */ hh = (int)secs / (60 * 60); mm = ((int)secs / 60) % 60; mm2 = ((int)secs / 30) % 60; ss5 = ((int)secs) % (60/5); /* If it has, draw a tick mark at the top of the image. 
*/ if (hh != last_hh) tick = 128; else if (mm != last_mm) tick = 40; else if (mm2 != last_mm2) tick = 20; else if (ss5 != last_ss5) tick = 10; else tick = 0; last_hh = hh; last_mm = mm; last_mm2 = mm2; last_ss5 = ss5; tick = 256 - tick; switch (state) { case HIST_LOUD_WRITTEN: color = 0; break; case HIST_SILENT_WRITTEN: color = 1; break; case HIST_SILENT_DELETED: color = 2; break; case HIST_SILENT_BUFFERED: color = 2; bar_top = 256; break; default: color = 2; bar_bot = 0; bar_top = 256; break; } for (y = 255; y >= 0; y--) { int store = (y <= vol); if (tick && y >= tick) /* clock marks */ store = !store; else if (y <= avg && !(y % 3) && !(((x8 << 3) | x) % 3) && !(y % 6) != !(((x8 << 3) | x) % 6)) /* overlay the average graph on the volume graph. */ store = !store; else if (color != 0 && y >= bar_bot && y < bar_top) { if (color == 2 ? y & 1 : (!(y % 3) && !(((x8 << 3) | x) % 3) && !(y % 6) != !(((x8 << 3) | x) % 6))) store = !store; } if (store) pbm_data [((height-y-1) * w8) + x8] |= (1 << (7-x)); } } } fprintf (f, "P4\n%d %d\n", width, height); if ((fwrite (pbm_data, 1, w8 * height, f) != w8 * height) || fclose (f)) { char buf[255]; sprintf(buf, "%.100s: %.100s", progname, filename); perror (buf); exit (1); } else fprintf (stderr, "%s: %s: wrote %d x %d histogram image\n", progname, filename, width, height); free (pbm_data); } /************************************************************************** + Initialization and main loop **************************************************************************/ static void sigalarm (int ignore) { timer_tick++; } static void print_time (FILE *out, int secs, int bytes, int frames, int bad_frames, float pct) { fprintf (out, "%d:%02d:%02d (", secs / (60 * 60), (secs / 60) % 60, (secs % 60)); if (bytes > 1024*1024) fprintf (stderr, "%.1fM", bytes / (1024 * 1024.0)); else if (bytes > 1024) fprintf (out, "%dK", bytes / 1024); else fprintf (out, "%d bytes", bytes); fprintf (out, ", %d frames", frames); if (bad_frames > 0) 
fprintf (out, ", %d bad", bad_frames); if (pct != -1) fprintf (out, ", %.1f%%", pct); fprintf (out, ")\n"); } static void process_stream (FILE *in, int verbose_p, const char *infile, const char *pbm_outfile, const char *mp3_outfile) { struct mad_decoder md; mp3_data D; mp3_data *data = &D; struct stat st; memset (&md, 0, sizeof(md)); memset (data, 0, sizeof(*data)); data->frame_count = -1; data->verbose_p = verbose_p; data->fd = fileno (in); data->buf_size = READ_BUFFER_SIZE; data->buf = (unsigned char *) malloc (data->buf_size); data->max_bad_frame_count = 200; if (!data->buf) { fprintf (stderr, "%s: out of memory (allocating %d buffer)\n", blurb, data->buf_size); exit (1); } if (!fstat (data->fd, &st)) data->file_size = st.st_size; else { fprintf (stderr, "%s: unable to stat input fd\n", blurb); exit (1); } /* #### */ /*data->file_size = 6000000;*/ if (mp3_outfile) { FILE *f; data->mp3_outfile = mp3_outfile; f = fopen (mp3_outfile, "w"); if (!f) { char buf[255]; sprintf(buf, "%.100s: %.100s", progname, mp3_outfile); perror (buf); exit (1); } else if (data->verbose_p > 3) fprintf (stderr, "%s: writing \"%s\"\n", progname, mp3_outfile); data->mp3_out = fileno (f); } if (data->verbose_p > 2) { signal (SIGALRM, sigalarm); alarm (ALARM_PERIOD); } mad_decoder_init (&md, (void *) data, input_cb, header_cb, filter_cb, output_cb, error_cb, 0); mad_decoder_run (&md, MAD_DECODER_MODE_SYNC); mad_decoder_finish (&md); process_silence (data); /* finish up at EOF */ if (data->verbose_p > 2) { fprintf (stderr, "%s: read ", blurb); print_time (stderr, data->elapsed_seconds, data->bytes_read, data->frame_count, data->bad_frame_count, -1); } if (data->verbose_p > 2) alarm (0); if (data->mp3_out) { if (close (data->mp3_out)) { char buf[255]; sprintf(buf, "%.100s: %.100s", progname, mp3_outfile); perror (buf); exit (1); } if (data->verbose_p > 0) { float secs_per_frame = (data->seconds_per_frame + (data->fractions_per_frame / (float)MAD_TIMER_RESOLUTION)); int secs = secs_per_frame * 
data->frames_written; int bytes = data->bytes_written; int frames = data->frames_written; fprintf (stderr, "%s: %s: wrote ", progname, mp3_outfile); print_time (stderr, secs, bytes, frames, -1, -1); secs = data->elapsed_seconds - secs; bytes = data->file_size - bytes; frames = data->frame_count - frames; fprintf (stderr, "%s: %s: deleted ", progname, mp3_outfile); print_time (stderr, secs, bytes, frames, -1, 100 - (data->bytes_written * 100.0 / data->file_size)); } if (data->frames_omitted_early) adjust_for_early_silence (infile, data); data->mp3_out = 0; } if (pbm_outfile) generate_pbm (data, pbm_outfile); /* Now that we've finished up, exit with an error if something went wrong. */ if (data->eof_p < 0) exit (data->eof_p); } /* If we have "early" silence (as per EARLY_SECONDS) and if a file XXX.time exists (corresponding to this XXX.mp3 file) then increment the write-date on that XXX.time file by the amount of the early silence. The idea here is that the date on XXX.mp3 is when the file stopped being written, but the date on XXX.time is when it started being written. If we're stripping out silence that was near the beginning of the file, we want to increase the start-time rather than decrementing the end-time. It's kind of bogus that this is happening in here, because it violates the division of work between silencer.c and clean-mp3s.pl -- in particular, if clean-mp3s.pl decides not to use the mp3 file we write for whatever reason, we've already modified the .time file, so clean-mp3s.pl has to un-do that... 
*/ static void adjust_for_early_silence (const char *filename, mp3_data *data) { if (data->frames_omitted != data->frame_count - data->frames_written) { int n = (data->frames_omitted - (data->frame_count - data->frames_written)); if (data->verbose_p > 1 || n > 50 || n < -50) fprintf (stderr, "%s: WARNING: expected %d omitted frames, got %d!\n", blurb, data->frame_count - data->frames_written, data->frames_omitted); } if (data->frames_omitted_early == 0) return; else if (!filename || !*filename) return; else if (data->frames_omitted_early < 1000) { if (data->verbose_p > 1) fprintf (stderr, "%s: %d frames omitted early; ignoring them.\n", blurb, data->frames_omitted_early); } else { float secs_per_frame = (data->seconds_per_frame + (data->fractions_per_frame / (float)MAD_TIMER_RESOLUTION)); int secs = secs_per_frame * data->frames_omitted_early; char *time_file = (char *) malloc (strlen (filename) + 10); char *s; struct stat st; strcpy (time_file, filename); s = strrchr (time_file, '.'); if (s) *s = 0; strcat (time_file, ".time"); if (!stat (time_file, &st)) { struct utimbuf ut; ut.actime = st.st_atime; ut.modtime = st.st_mtime + secs; if (utime (time_file, &ut)) { char buf[255]; sprintf (buf, "%.100s: %.100s: utime", progname, time_file); perror (buf); } else if (data->verbose_p > 1) { struct tm *tm; int h1, m1, s1; int h2, m2, s2; tm = localtime (&st.st_mtime); h1 = tm->tm_hour; m1 = tm->tm_min; s1 = tm->tm_sec; tm = localtime (&ut.modtime); h2 = tm->tm_hour; m2 = tm->tm_min; s2 = tm->tm_sec; fprintf (stderr, "%s: %d frames omitted early (%d:%02d:%02d).\n", blurb, data->frames_omitted_early, secs / (60 * 60), (secs / 60) % 60, (secs % 60)); fprintf (stderr, "%s: adjusted date of %s from " "%d:%02d:%02d to %d:%02d:%02d.\n", progname, time_file, h1, m1, s1, h2, m2, s2); } } else { if (data->verbose_p > 1) fprintf (stderr, "%s: %s does not exist.\n", progname, time_file); } free (time_file); } } static void usage(void) { fprintf (stderr, "usage: %s [--verbose] 
input-file [--strip output.mp3]" " [--pbm output.pbm]\n", progname); exit (1); } int main (int argc, char **argv) { char *s; int i; char *filename = 0; char *ofilename = 0; FILE *f; int verbose_p = 0; char *pbm_outfile = 0; char *mp3_outfile = 0; progname = argv[0]; s = strrchr(progname, '/'); if (s) progname = s+1; blurb = progname; for (i = 1; i < argc; i++) { if (argv[i][0] == '-' && argv[i][1] == '-') argv[i]++; if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "-verbose")) verbose_p++; else if (!strcmp(argv[i], "-vv")) verbose_p += 2; else if (!strcmp(argv[i], "-vvv")) verbose_p += 3; else if (!strcmp(argv[i], "-vvvv")) verbose_p += 4; else if (!strcmp(argv[i], "-vvvvv")) verbose_p += 5; else if (!strcmp(argv[i], "-vvvvvv")) verbose_p += 6; else if (!strcmp(argv[i], "-vvvvvvv")) verbose_p += 7; else if (!strcmp(argv[i], "-vvvvvvvv")) verbose_p += 8; else if (!strcmp(argv[i], "-vvvvvvvvv")) verbose_p += 9; else if (!strcmp(argv[i], "-pbm")) { pbm_outfile = argv[++i]; if (pbm_outfile[0] == '-' || i >= argc) usage(); } else if (!strcmp(argv[i], "-strip")) { mp3_outfile = argv[++i]; if (mp3_outfile[0] == '-' || i >= argc) usage(); } else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "-help") || !strcmp(argv[i], "-version")) { char *v = strdup (cvs_id); char *s = strchr (v, ' '); v = s+1; s = strchr (v, ' '); *s = 0; fprintf (stderr, "Silencer, version %s: strips silence from MP3 files.\n" "Copyright (c) 2001 Jamie Zawinski \n" "http://www.dnalounge.com/backstage/src/archiver/\n" "\n", v); usage(); } else if (argv[i][0] == '-' && argv[i][1]) usage(); else if (!filename) filename = argv[i]; else usage(); } if (!filename) usage(); if (*filename == '-') { f = stdin; filename = 0; blurb = (char *) malloc (strlen(progname) + 50); sprintf (blurb, "%s: ", progname); if (verbose_p > 3) fprintf (stderr, "%s: reading \n", progname); } else { ofilename = filename; f = fopen (filename, "r"); if (!f) { char buf[255]; sprintf(buf, "%.100s: %.100s", progname, filename); perror 
(buf); exit (1); } else if (verbose_p > 3) fprintf (stderr, "%s: reading \"%s\"\n", progname, filename); s = strrchr(filename, '/'); if (s) filename = s+1; blurb = (char *) malloc (strlen(progname) + strlen(filename) + 50); sprintf (blurb, "%s: %s", progname, filename); } process_stream (f, verbose_p, ofilename, pbm_outfile, mp3_outfile); if (filename) fclose (f); exit (0); }