001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.compress.harmony.pack200; 018 019import java.io.EOFException; 020import java.io.IOException; 021import java.io.InputStream; 022import java.util.ArrayList; 023import java.util.Arrays; 024import java.util.List; 025 026import org.apache.commons.compress.utils.ExactMath; 027 028/** 029 * A BHSD codec is a means of encoding integer values as a sequence of bytes or vice versa using a specified "BHSD" 030 * encoding mechanism. It uses a variable-length encoding and a modified sign representation such that small numbers are 031 * represented as a single byte, whilst larger numbers take more bytes to encode. The number may be signed or unsigned; 032 * if it is unsigned, it can be weighted towards positive numbers or equally distributed using a one's complement. The 033 * Codec also supports delta coding, where a sequence of numbers is represented as a series of first-order differences. 034 * So a delta encoding of the integers [1..10] would be represented as a sequence of 10x1s. This allows the absolute 035 * value of a coded integer to fall outside of the 'small number' range, whilst still being encoded as a single byte. 036 * 037 * A BHSD codec is configured with four parameters: 038 * <dl> 039 * <dt>B</dt> 040 * <dd>The maximum number of bytes that each value is encoded as. B must be a value between [1..5]. For a pass-through 041 * coding (where each byte is encoded as itself, aka {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).</dd> 042 * <dt>H</dt> 043 * <dd>The radix of the integer. Values are defined as a sequence of values, where value {@code n} is multiplied by 044 * {@code H^<sup>n</sup>}. So the number 1234 may be represented as the sequence 4 3 2 1 with a radix (H) of 10. 045 * Note that other permutations are also possible; 43 2 1 will also encode 1234. The co-parameter L is defined as 256-H. 046 * This is important because only the last value in a sequence may be < L; all prior values must be > L.</dd> 047 * <dt>S</dt> 048 * <dd>Whether the codec represents signed values (or not). This may have 3 values; 0 (unsigned), 1 (signed, one's 049 * complement) or 2 (signed, two's complement)</dd> 050 * <dt>D</dt> 051 * <dd>Whether the codec represents a delta encoding. This may be 0 (no delta) or 1 (delta encoding). A delta encoding 052 * of 1 indicates that values are cumulative; a sequence of {@code 1 1 1 1 1} will represent the sequence 053 * {@code 1 2 3 4 5}. For this reason, the codec supports two variants of decode; one 054 * {@link #decode(InputStream, long) with} and one {@link #decode(InputStream) without} a {@code last} parameter. 055 * If the codec is a non-delta encoding, then the value is ignored if passed. If the codec is a delta encoding, it is a 056 * run-time error to call the value without the extra parameter, and the previous value should be returned. (It was 057 * designed this way to support multi-threaded access without requiring a new instance of the Codec to be cloned for 058 * each use.) 059 * </dd> 060 * </dl> 061 * 062 * Codecs are notated as (B,H,S,D) and either D or S,D may be omitted if zero. Thus {@link #BYTE1} is denoted 063 * (1,256,0,0) or (1,256). The {@link #toString()} method prints out the condensed form of the encoding. Often, the last 064 * character in the name ({@link #BYTE1}, {@link #UNSIGNED5}) gives a clue as to the B value. Those that start with U 065 * ({@link #UDELTA5}, {@link #UNSIGNED5}) are unsigned; otherwise, in most cases, they are signed. The presence of the 066 * word Delta ({@link #DELTA5}, {@link #UDELTA5}) indicates a delta encoding is used. 067 * 068 */ 069public final class BHSDCodec extends Codec { 070 071 /** 072 * The maximum number of bytes in each coding word 073 */ 074 private final int b; 075 076 /** 077 * Whether delta encoding is used (0=false,1=true) 078 */ 079 private final int d; 080 081 /** 082 * The radix of the encoding 083 */ 084 private final int h; 085 086 /** 087 * The co-parameter of h; 256-h 088 */ 089 private final int l; 090 091 /** 092 * Represents signed numbers or not (0=unsigned,1/2=signed) 093 */ 094 private final int s; 095 096 private long cardinality; 097 098 private final long smallest; 099 100 private final long largest; 101 102 /** 103 * radix^i powers 104 */ 105 private final long[] powers; 106 107 /** 108 * Constructs an unsigned, non-delta Codec with the given B and H values. 109 * 110 * @param b the maximum number of bytes that a value can be encoded as [1..5] 111 * @param h the radix of the encoding [1..256] 112 */ 113 public BHSDCodec(final int b, final int h) { 114 this(b, h, 0, 0); 115 } 116 117 /** 118 * Constructs a non-delta Codec with the given B, H and S values. 119 * 120 * @param b the maximum number of bytes that a value can be encoded as [1..5] 121 * @param h the radix of the encoding [1..256] 122 * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 123 * is signed with ?) 124 */ 125 public BHSDCodec(final int b, final int h, final int s) { 126 this(b, h, s, 0); 127 } 128 129 /** 130 * Constructs a Codec with the given B, H, S and D values. 131 * 132 * @param b the maximum number of bytes that a value can be encoded as [1..5] 133 * @param h the radix of the encoding [1..256] 134 * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 135 * is signed with ?) 136 * @param d whether this is a delta encoding (d=0 is non-delta; d=1 is delta) 137 */ 138 public BHSDCodec(final int b, final int h, final int s, final int d) { 139 if (b < 1 || b > 5) { 140 throw new IllegalArgumentException("1<=b<=5"); 141 } 142 if (h < 1 || h > 256) { 143 throw new IllegalArgumentException("1<=h<=256"); 144 } 145 if (s < 0 || s > 2) { 146 throw new IllegalArgumentException("0<=s<=2"); 147 } 148 if (d < 0 || d > 1) { 149 throw new IllegalArgumentException("0<=d<=1"); 150 } 151 if (b == 1 && h != 256) { 152 throw new IllegalArgumentException("b=1 -> h=256"); 153 } 154 if (h == 256 && b == 5) { 155 throw new IllegalArgumentException("h=256 -> b!=5"); 156 } 157 this.b = b; 158 this.h = h; 159 this.s = s; 160 this.d = d; 161 this.l = 256 - h; 162 if (h == 1) { 163 cardinality = b * 255 + 1; 164 } else { 165 cardinality = (long) ((long) (l * (1 - Math.pow(h, b)) / (1 - h)) + Math.pow(h, b)); 166 } 167 smallest = calculateSmallest(); 168 largest = calculateLargest(); 169 170 powers = new long[b]; 171 Arrays.setAll(powers, c -> (long) Math.pow(h, c)); 172 } 173 174 private long calculateLargest() { 175 long result; 176 // TODO This can probably be optimized into a better mathematical 177 // statement 178 if (d == 1) { 179 final BHSDCodec bh0 = new BHSDCodec(b, h); 180 return bh0.largest(); 181 } 182 switch (s) { 183 case 0: 184 result = cardinality() - 1; 185 break; 186 case 1: 187 result = cardinality() / 2 - 1; 188 break; 189 case 2: 190 result = (3L * cardinality()) / 4 - 1; 191 break; 192 default: 193 throw new Error("Unknown s value"); 194 } 195 return Math.min((s == 0 ? ((long) Integer.MAX_VALUE) << 1 : Integer.MAX_VALUE) - 1, result); 196 } 197 198 private long calculateSmallest() { 199 long result; 200 if (d == 1 || !isSigned()) { 201 if (cardinality >= 4294967296L) { // 2^32 202 result = Integer.MIN_VALUE; 203 } else { 204 result = 0; 205 } 206 } else { 207 result = Math.max(Integer.MIN_VALUE, -cardinality() / (1 << s)); 208 } 209 return result; 210 } 211 212 /** 213 * Returns the cardinality of this codec; that is, the number of distinct values that it can contain. 214 * 215 * @return the cardinality of this codec 216 */ 217 public long cardinality() { 218 return cardinality; 219 } 220 221 @Override 222 public int decode(final InputStream in) throws IOException, Pack200Exception { 223 if (d != 0) { 224 throw new Pack200Exception("Delta encoding used without passing in last value; this is a coding error"); 225 } 226 return decode(in, 0); 227 } 228 229 @Override 230 public int decode(final InputStream in, final long last) throws IOException, Pack200Exception { 231 int n = 0; 232 long z = 0; 233 long x = 0; 234 235 do { 236 x = in.read(); 237 lastBandLength++; 238 z += x * powers[n]; 239 n++; 240 } while (x >= l && n < b); 241 242 if (x == -1) { 243 throw new EOFException("End of stream reached whilst decoding"); 244 } 245 246 if (isSigned()) { 247 final int u = ((1 << s) - 1); 248 if ((z & u) == u) { 249 z = z >>> s ^ -1L; 250 } else { 251 z = z - (z >>> s); 252 } 253 } 254 // This algorithm does the same thing, but is probably slower. Leaving 255 // in for now for readability 256 // if (isSigned()) { 257 // long u = z; 258 // long twoPowS = (long)Math.pow(2, s); 259 // double twoPowSMinusOne = twoPowS-1; 260 // if (u % twoPowS < twoPowSMinusOne) { 261 // if (cardinality < Math.pow(2, 32)) { 262 // z = (long) (u - (Math.floor(u/ twoPowS))); 263 // } else { 264 // z = cast32((long) (u - (Math.floor(u/ twoPowS)))); 265 // } 266 // } else { 267 // z = (long) (-Math.floor(u/ twoPowS) - 1); 268 // } 269 // } 270 if (isDelta()) { 271 z += last; 272 } 273 return (int) z; 274 } 275 276 // private long cast32(long u) { 277 // u = (long) ((long) ((u + Math.pow(2, 31)) % Math.pow(2, 32)) - 278 // Math.pow(2, 31)); 279 // return u; 280 // } 281 282 @Override 283 public int[] decodeInts(final int n, final InputStream in) throws IOException, Pack200Exception { 284 final int[] band = super.decodeInts(n, in); 285 if (isDelta()) { 286 for (int i = 0; i < band.length; i++) { 287 while (band[i] > largest) { 288 band[i] -= cardinality; 289 } 290 while (band[i] < smallest) { 291 band[i] = ExactMath.add(band[i], cardinality); 292 } 293 } 294 } 295 return band; 296 } 297 298 @Override 299 public int[] decodeInts(final int n, final InputStream in, final int firstValue) 300 throws IOException, Pack200Exception { 301 final int[] band = super.decodeInts(n, in, firstValue); 302 if (isDelta()) { 303 for (int i = 0; i < band.length; i++) { 304 while (band[i] > largest) { 305 band[i] -= cardinality; 306 } 307 while (band[i] < smallest) { 308 band[i] = ExactMath.add(band[i], cardinality); 309 } 310 } 311 } 312 return band; 313 } 314 315 @Override 316 public byte[] encode(final int value) throws Pack200Exception { 317 return encode(value, 0); 318 } 319 320 @Override 321 public byte[] encode(final int value, final int last) throws Pack200Exception { 322 if (!encodes(value)) { 323 throw new Pack200Exception("The codec " + this + " does not encode the value " + value); 324 } 325 326 long z = value; 327 if (isDelta()) { 328 z -= last; 329 } 330 if (isSigned()) { 331 if (z < Integer.MIN_VALUE) { 332 z += 4294967296L; 333 } else if (z > Integer.MAX_VALUE) { 334 z -= 4294967296L; 335 } 336 if (z < 0) { 337 z = (-z << s) - 1; 338 } else if (s == 1) { 339 z = z << s; 340 } else { 341 z += (z - z % 3) / 3; 342 } 343 } else if (z < 0) { 344 // Need to use integer overflow here to represent negatives. 345 // 4294967296L is the 1 << 32. 346 z += Math.min(cardinality, 4294967296L); 347 } 348 if (z < 0) { 349 throw new Pack200Exception("unable to encode"); 350 } 351 352 final List<Byte> byteList = new ArrayList<>(); 353 for (int n = 0; n < b; n++) { 354 long byteN; 355 if (z < l) { 356 byteN = z; 357 } else { 358 byteN = z % h; 359 while (byteN < l) { 360 byteN += h; 361 } 362 } 363 byteList.add(Byte.valueOf((byte) byteN)); 364 if (byteN < l) { 365 break; 366 } 367 z -= byteN; 368 z /= h; 369 } 370 final byte[] bytes = new byte[byteList.size()]; 371 for (int i = 0; i < bytes.length; i++) { 372 bytes[i] = byteList.get(i).byteValue(); 373 } 374 return bytes; 375 } 376 377 /** 378 * True if this encoding can code the given value 379 * 380 * @param value the value to check 381 * @return {@code true} if the encoding can encode this value 382 */ 383 public boolean encodes(final long value) { 384 return value >= smallest && value <= largest; 385 } 386 387 @Override 388 public boolean equals(final Object o) { 389 if (o instanceof BHSDCodec) { 390 final BHSDCodec codec = (BHSDCodec) o; 391 return codec.b == b && codec.h == h && codec.s == s && codec.d == d; 392 } 393 return false; 394 } 395 396 /** 397 * @return the b 398 */ 399 public int getB() { 400 return b; 401 } 402 403 /** 404 * @return the h 405 */ 406 public int getH() { 407 return h; 408 } 409 410 /** 411 * @return the l 412 */ 413 public int getL() { 414 return l; 415 } 416 417 /** 418 * @return the s 419 */ 420 public int getS() { 421 return s; 422 } 423 424 @Override 425 public int hashCode() { 426 return ((b * 37 + h) * 37 + s) * 37 + d; 427 } 428 429 /** 430 * Returns true if this codec is a delta codec 431 * 432 * @return true if this codec is a delta codec 433 */ 434 public boolean isDelta() { 435 return d != 0; 436 } 437 438 /** 439 * Returns true if this codec is a signed codec 440 * 441 * @return true if this codec is a signed codec 442 */ 443 public boolean isSigned() { 444 return s != 0; 445 } 446 447 /** 448 * Returns the largest value that this codec can represent. 449 * 450 * @return the largest value that this codec can represent. 451 */ 452 public long largest() { 453 return largest; 454 } 455 456 /** 457 * Returns the smallest value that this codec can represent. 458 * 459 * @return the smallest value that this codec can represent. 460 */ 461 public long smallest() { 462 return smallest; 463 } 464 465 /** 466 * Returns the codec in the form (1,256) or (1,64,1,1). Note that trailing zero fields are not shown. 467 */ 468 @Override 469 public String toString() { 470 final StringBuilder buffer = new StringBuilder(11); 471 buffer.append('('); 472 buffer.append(b); 473 buffer.append(','); 474 buffer.append(h); 475 if (s != 0 || d != 0) { 476 buffer.append(','); 477 buffer.append(s); 478 } 479 if (d != 0) { 480 buffer.append(','); 481 buffer.append(d); 482 } 483 buffer.append(')'); 484 return buffer.toString(); 485 } 486}