/*
* CompressEngine.java
* compresses ala the Unix program compress
*
* Adapted from C code on January 3, 2008, 5:15 PM
* @author fred
*/
/*
* Copyright (c) 1985, 1986 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* James A. Woods, derived from original work by Spencer Thomas
* and Joseph Orost.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
package com.physpics.tools.encoding;
import java.io.*;
/** The compress
method reads an InputStream
and writes a compressed version to an OutputStream.
The compression algorithm is that of the Unix program
compress
. The output is suitable for HTTP
transmission with Content-Encoding: compress.
This code was transposed by Fred Hansen, January, 2008, from the original C:
* compress.c - File compression ala IEEE Computer, June 1984.
*
* Authors: Spencer W. Thomas (decvax!utah-cs!thomas)
* Jim McKie (decvax!mcvax!jim)
* Steve Davies (decvax!vax135!petsd!peora!srd)
* Ken Turkowski (decvax!decwrl!turtlevax!ken)
* James A. Woods (decvax!ihnp4!ames!jaw)
* Joe Orost (decvax!vax135!petsd!joe)
* Algorithm from "A Technique for High Performance Data Compression",
* Terry A. Welch, IEEE Computer Vol 17, No 6 (June 1984), pp 8-19.
*/
public class CompressEngine {
/* MAXBITS is the maximum number of bits per code
in the C progam, it was limited by word size
Now longer would be possible.
But UncompressOutputStream will not handle larger.
HSIZE is the size of a hashtable.
It limits the value of MAXBITS because
HSIZE > 1.05 * 2^MAXBITS
and huge arrays impose resource constraints
we need not impose.
*/
static final int MAXBITS = 16; /* max bits/code */
static final int HSIZE = 69001; /* 95% occupancy */
int n_bits; /* number of bits/code */
int maxbits = MAXBITS; /* max # bits/code */
int maxcode; /* maximum code, given n_bits */
int maxmaxcode = 1 << MAXBITS; /* should NEVER generate this code */
static int MAXCODE(int n_bits) { return (1<compress method.
@param fsize The expected length of the file to be compressed.
*/
public void input_size(int fsize) {
/*
* tune hash table size for small files -- ad hoc,
* but the sizes match earlier #defines, which
* serve as upper bounds on the number of output codes.
*/
hsize = HSIZE;
if ( fsize < (1 << 12) )
hsize = 5003;
else if ( fsize < (1 << 13) )
hsize = 9001;
else if ( fsize < (1 << 14) )
hsize = 18013;
else if ( fsize < (1 << 15) )
hsize = 35023;
else if ( fsize < 47000 )
hsize = 50021;
}
/**
* compresses instr
to outstr
.
*
* Algorithm: use open addressing double hashing (no chaining) on the
* prefix code / next character combination. We do a variant of Knuth's
* algorithm D (vol. 3, sec. 6.4) along with G. Knott's relatively-prime
* secondary probe. Here, the modular division first probe gives way
* to a faster exclusive-or manipulation. Also do block compression with
* an adaptive reset, whereby the code table is cleared when the compression
* ratio decreases, but after the table fills. The variable-length output
* codes are re-sized at this point, and a special CLEAR code is generated
* for the decompressor. Late addition: construct the table according to
* file size for noticeable speed improvement on small files. Please direct
* questions about this implementation to ames!jaw.
@return true iff the output is fewer bytes than the input.
* The InputStream and OutputStream are NOT closed.
@throws IOException if reading or writing fails.
*/
public boolean compress() throws IOException {
int fcode;
int c, ent, disp, hshift;
outstr.write(magic_header[0]);
outstr.write(magic_header[1]);
outstr.write(maxbits | BLOCK_MASK); // we may do block compression
offset = 0;
bytes_out = 3; /* includes 3-byte header mojo */
out_count = 0;
clear_flg = false;
ratio = 0;
in_count = 1;
checkpoint = CHECK_GAP;
maxcode = MAXCODE(n_bits = INIT_BITS);
free_ent = FIRST;
ent = instr.read();
hshift = 0;
for ( fcode = hsize; fcode < 65536; fcode *= 2 )
hshift++;
hshift = 8 - hshift; /* set hash code range bound */
cl_hash(hsize); /* clear hash table */
mainloop:
while ( (c = instr.read()) != -1 ) {
in_count++;
fcode = (c << maxbits) + ent;
int i = (c << hshift) ^ ent; /* xor hashing */
if ( htab[i] == fcode ) {
ent = codetab[i];
continue;
}
else if ( htab[i] >= 0 ) { /* non-empty slot */
disp = hsize - i; /* secondary hash (after G. Knott) */
if ( i == 0 )
disp = 1;
do {
if ( (i -= disp) < 0 )
i += hsize;
if ( htab[i] == fcode ) {
ent = codetab[i];
continue mainloop;
}
} while (htab[i] > 0);
}
output ( ent );
out_count++;
ent = c;
if ( free_ent < maxmaxcode ) {
codetab[i] = free_ent++; /* code -> hashtable */
htab[i] = fcode;
}
else if ( in_count >= checkpoint )
cl_block ();
}
/*
* Put out the final code.
*/
output( ent );
out_count++;
output( -1 );
return bytes_out < in_count;
}
byte buf[] = new byte[MAXBITS]; // buffer for output method
// the buffer will hold 8*NBITS bits; eight codes occupy 8*NBITS bits
// so eight codes will fit in buf
static byte lmask[] = {(byte)0xff, (byte)0xfe, (byte)0xfc, (byte)0xf8,
(byte)0xf0, (byte)0xe0, (byte)0xc0, (byte)0x80, (byte)0x00};
static byte rmask[] = {(byte)0x00, (byte)0x01, (byte)0x03, (byte)0x07,
(byte)0x0f, (byte)0x1f, (byte)0x3f, (byte)0x7f, (byte)0xff};
/*-
* Output the given code.
* Inputs:
* code: A n_bits-bit integer. If == -1, then EOF. This assumes
* that n_bits <= wordsize - 1.
* Outputs:
* Outputs code to the file.
* Assumptions:
* Chars are 8 bits long.
* Algorithm:
* Insert successive codes in a buffer of length MAXBITS.
* (Eight successive output codes will fill it exactly.)
* When the buffer is full, empty it and start over.
*/
void output( int code ) throws IOException {
int r_off = offset, // the bit offset to the first empty bit
bits = n_bits, // bitss remaining to output from current code
bx; // index into buf
if ( code >= 0 ) { // (that is, if the code is not EOF)
// Set bx and r_off for the first byte.
bx = (r_off >> 3);
r_off &= 7;
// the code is 8 bits or more, so it exceeds a byte
// so we need only mask it on one side to fill out buf[bx]
// Sadly, we are emulating a vax, so the bits we choose are
// the LEAST significant digits of the code, the bits on the right.
buf[bx] = (byte)((buf[bx] & rmask[r_off]) | (code << r_off) & lmask[r_off]);
bx++;
bits -= (8 - r_off);
code >>= 8 - r_off;
// Stow any 8 bit parts in the middle (<=1 for up to 16 bits).
if ( bits >= 8 ) {
buf[bx++] = (byte)code;
code >>= 8;
bits -= 8;
}
/* Last bits. */
if(bits > 0)
buf[bx] = (byte)code;
offset += n_bits;
if ( offset == (n_bits << 3) ) {
outstr.write( buf, 0, n_bits );
bytes_out += n_bits;
offset = 0;
}
/*
* If the next entry is going to be too big for the code size,
* then increase it, if possible.
*/
if ( (free_ent > maxcode) || clear_flg ) {
/*
* Write the whole buffer, because the input side won't
* discover the size increase until after it has read it.
*/
if ( offset > 0 ) {
outstr.write( buf, 0, n_bits );
bytes_out += n_bits;
}
offset = 0;
if ( clear_flg ) {
maxcode = MAXCODE (n_bits = INIT_BITS);
clear_flg = false;
}
else {
n_bits++;
if ( n_bits == maxbits )
maxcode = maxmaxcode;
else
maxcode = MAXCODE(n_bits);
}
}
}
else { // code < 0, i.e., EOF
// At EOF, write the rest of the buffer.
if ( offset > 0 ) {
int nbytes = (offset + 7) / 8;
outstr.write( buf, 0, nbytes );
bytes_out += nbytes;
///*DEBUG*/ System.out.println("for in_count == " + in_count
// + " wrote final " + nbytes + " bytes");
}
offset = 0;
outstr.flush();
}
}
/** Checks the ratio of output to input and possibly clears the hashtable.
*/
void cl_block () throws IOException {
int rat; // current ratio
checkpoint = in_count + CHECK_GAP;
if(in_count > 0x007fffff) { /* shift will overflow */
rat = bytes_out >> 8;
rat = (rat==0) ? 0x7fffffff : in_count / rat;
}
else
rat = (in_count << 8) / bytes_out; /* 8 fractional bits */
if ( rat > ratio )
ratio = rat;
else {
ratio = 0;
cl_hash ( hsize );
free_ent = FIRST;
clear_flg = true;
output ( CLEAR );
}
}
/** Clears the hashtable to all -1's.
*/
void cl_hash(int hsize) { /* reset code table */
for (int i = 0; i< hsize; i++) htab[i] = -1;
}
} // end class CompressEngine