1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package lzw implements the Lempel-Ziv-Welch compressed data format,
6// described in T. A. Welch, ``A Technique for High-Performance Data
7// Compression'', Computer, 17(6) (June 1984), pp 8-19.
8//
9// In particular, it implements LZW as used by the TIFF file format, including
10// an "off by one" algorithmic difference when compared to standard LZW.
11package lzw // import "golang.org/x/image/tiff/lzw"
12
13/*
14This file was branched from src/pkg/compress/lzw/reader.go in the
15standard library. Differences from the original are marked with "NOTE".
16
17The tif_lzw.c file in the libtiff C library has this comment:
18
19----
20The 5.0 spec describes a different algorithm than Aldus
21implements. Specifically, Aldus does code length transitions
22one code earlier than should be done (for real LZW).
23Earlier versions of this library implemented the correct
24LZW algorithm, but emitted codes in a bit order opposite
25to the TIFF spec. Thus, to maintain compatibility w/ Aldus
26we interpret MSB-LSB ordered codes to be images written w/
27old versions of this library, but otherwise adhere to the
28Aldus "off by one" algorithm.
29----
30
31The Go code doesn't read (invalid) TIFF files written by old versions of
32libtiff, but the LZW algorithm in this package still differs from the one in
33Go's standard package library to accomodate this "off by one" in valid TIFFs.
34*/
35
36import (
37 "bufio"
38 "errors"
39 "fmt"
40 "io"
41)
42
43// Order specifies the bit ordering in an LZW data stream.
44type Order int
45
46const (
47 // LSB means Least Significant Bits first, as used in the GIF file format.
48 LSB Order = iota
49 // MSB means Most Significant Bits first, as used in the TIFF and PDF
50 // file formats.
51 MSB
52)
53
54const (
55 maxWidth = 12
56 decoderInvalidCode = 0xffff
57 flushBuffer = 1 << maxWidth
58)
59
60// decoder is the state from which the readXxx method converts a byte
61// stream into a code stream.
62type decoder struct {
63 r io.ByteReader
64 bits uint32
65 nBits uint
66 width uint
67 read func(*decoder) (uint16, error) // readLSB or readMSB
68 litWidth int // width in bits of literal codes
69 err error
70
71 // The first 1<<litWidth codes are literal codes.
72 // The next two codes mean clear and EOF.
73 // Other valid codes are in the range [lo, hi] where lo := clear + 2,
74 // with the upper bound incrementing on each code seen.
75 // overflow is the code at which hi overflows the code width. NOTE: TIFF's LZW is "off by one".
76 // last is the most recently seen code, or decoderInvalidCode.
77 clear, eof, hi, overflow, last uint16
78
79 // Each code c in [lo, hi] expands to two or more bytes. For c != hi:
80 // suffix[c] is the last of these bytes.
81 // prefix[c] is the code for all but the last byte.
82 // This code can either be a literal code or another code in [lo, c).
83 // The c == hi case is a special case.
84 suffix [1 << maxWidth]uint8
85 prefix [1 << maxWidth]uint16
86
87 // output is the temporary output buffer.
88 // Literal codes are accumulated from the start of the buffer.
89 // Non-literal codes decode to a sequence of suffixes that are first
90 // written right-to-left from the end of the buffer before being copied
91 // to the start of the buffer.
92 // It is flushed when it contains >= 1<<maxWidth bytes,
93 // so that there is always room to decode an entire code.
94 output [2 * 1 << maxWidth]byte
95 o int // write index into output
96 toRead []byte // bytes to return from Read
97}
98
99// readLSB returns the next code for "Least Significant Bits first" data.
100func (d *decoder) readLSB() (uint16, error) {
101 for d.nBits < d.width {
102 x, err := d.r.ReadByte()
103 if err != nil {
104 return 0, err
105 }
106 d.bits |= uint32(x) << d.nBits
107 d.nBits += 8
108 }
109 code := uint16(d.bits & (1<<d.width - 1))
110 d.bits >>= d.width
111 d.nBits -= d.width
112 return code, nil
113}
114
115// readMSB returns the next code for "Most Significant Bits first" data.
116func (d *decoder) readMSB() (uint16, error) {
117 for d.nBits < d.width {
118 x, err := d.r.ReadByte()
119 if err != nil {
120 return 0, err
121 }
122 d.bits |= uint32(x) << (24 - d.nBits)
123 d.nBits += 8
124 }
125 code := uint16(d.bits >> (32 - d.width))
126 d.bits <<= d.width
127 d.nBits -= d.width
128 return code, nil
129}
130
131func (d *decoder) Read(b []byte) (int, error) {
132 for {
133 if len(d.toRead) > 0 {
134 n := copy(b, d.toRead)
135 d.toRead = d.toRead[n:]
136 return n, nil
137 }
138 if d.err != nil {
139 return 0, d.err
140 }
141 d.decode()
142 }
143}
144
145// decode decompresses bytes from r and leaves them in d.toRead.
146// read specifies how to decode bytes into codes.
147// litWidth is the width in bits of literal codes.
148func (d *decoder) decode() {
149 // Loop over the code stream, converting codes into decompressed bytes.
150loop:
151 for {
152 code, err := d.read(d)
153 if err != nil {
154 if err == io.EOF {
155 err = io.ErrUnexpectedEOF
156 }
157 d.err = err
158 break
159 }
160 switch {
161 case code < d.clear:
162 // We have a literal code.
163 d.output[d.o] = uint8(code)
164 d.o++
165 if d.last != decoderInvalidCode {
166 // Save what the hi code expands to.
167 d.suffix[d.hi] = uint8(code)
168 d.prefix[d.hi] = d.last
169 }
170 case code == d.clear:
171 d.width = 1 + uint(d.litWidth)
172 d.hi = d.eof
173 d.overflow = 1 << d.width
174 d.last = decoderInvalidCode
175 continue
176 case code == d.eof:
177 d.err = io.EOF
178 break loop
179 case code <= d.hi:
180 c, i := code, len(d.output)-1
181 if code == d.hi && d.last != decoderInvalidCode {
182 // code == hi is a special case which expands to the last expansion
183 // followed by the head of the last expansion. To find the head, we walk
184 // the prefix chain until we find a literal code.
185 c = d.last
186 for c >= d.clear {
187 c = d.prefix[c]
188 }
189 d.output[i] = uint8(c)
190 i--
191 c = d.last
192 }
193 // Copy the suffix chain into output and then write that to w.
194 for c >= d.clear {
195 d.output[i] = d.suffix[c]
196 i--
197 c = d.prefix[c]
198 }
199 d.output[i] = uint8(c)
200 d.o += copy(d.output[d.o:], d.output[i:])
201 if d.last != decoderInvalidCode {
202 // Save what the hi code expands to.
203 d.suffix[d.hi] = uint8(c)
204 d.prefix[d.hi] = d.last
205 }
206 default:
207 d.err = errors.New("lzw: invalid code")
208 break loop
209 }
210 d.last, d.hi = code, d.hi+1
211 if d.hi+1 >= d.overflow { // NOTE: the "+1" is where TIFF's LZW differs from the standard algorithm.
212 if d.width == maxWidth {
213 d.last = decoderInvalidCode
214 } else {
215 d.width++
216 d.overflow <<= 1
217 }
218 }
219 if d.o >= flushBuffer {
220 break
221 }
222 }
223 // Flush pending output.
224 d.toRead = d.output[:d.o]
225 d.o = 0
226}
227
228var errClosed = errors.New("lzw: reader/writer is closed")
229
230func (d *decoder) Close() error {
231 d.err = errClosed // in case any Reads come along
232 return nil
233}
234
235// NewReader creates a new io.ReadCloser.
236// Reads from the returned io.ReadCloser read and decompress data from r.
237// If r does not also implement io.ByteReader,
238// the decompressor may read more data than necessary from r.
239// It is the caller's responsibility to call Close on the ReadCloser when
240// finished reading.
241// The number of bits to use for literal codes, litWidth, must be in the
242// range [2,8] and is typically 8. It must equal the litWidth
243// used during compression.
244func NewReader(r io.Reader, order Order, litWidth int) io.ReadCloser {
245 d := new(decoder)
246 switch order {
247 case LSB:
248 d.read = (*decoder).readLSB
249 case MSB:
250 d.read = (*decoder).readMSB
251 default:
252 d.err = errors.New("lzw: unknown order")
253 return d
254 }
255 if litWidth < 2 || 8 < litWidth {
256 d.err = fmt.Errorf("lzw: litWidth %d out of range", litWidth)
257 return d
258 }
259 if br, ok := r.(io.ByteReader); ok {
260 d.r = br
261 } else {
262 d.r = bufio.NewReader(r)
263 }
264 d.litWidth = litWidth
265 d.width = 1 + uint(litWidth)
266 d.clear = uint16(1) << uint(litWidth)
267 d.eof, d.hi = d.clear+1, d.clear+1
268 d.overflow = uint16(1) << d.width
269 d.last = decoderInvalidCode
270
271 return d
272}