1 // Copyright 2019 Tero Hänninen. All rights reserved.
2 // SPDX-License-Identifier: BSD-2-Clause
3 module imagefmt.jpeg;
4 
5 import std.math : ceil;
6 import imagefmt;
7 
8 @nogc nothrow package:
9 
10 struct JPEGDecoder {
11     Reader* rc;
12 
13     ubyte[64][4] qtables;
14     HuffTab[2] ac_tables;
15     HuffTab[2] dc_tables;
16 
17     int bits_left;   // num of unused bits in cb
18     ubyte cb;  // current byte (next bit always at MSB)
19 
20     bool has_frame_header = false;
21     bool eoi_reached = false;
22 
23     bool correct_comp_ids;
24     ubyte[] compsbuf;
25     Component[3] comps;
26     ubyte num_comps;
27     int tchans;
28 
29     int width;
30     int height;
31     int hmax;
32     int vmax;
33     int num_mcu_x;
34     int num_mcu_y;
35 
36     ushort restart_interval;    // number of MCUs in restart interval
37 }
38 
39 // image component
40 struct Component {
41     size_t x;       // total num of samples, without fill samples
42     size_t y;       // total num of samples, without fill samples
43     ubyte[] data;   // reconstructed samples
44     int pred;       // dc prediction
45     ubyte sfx;      // sampling factor, aka. h
46     ubyte sfy;      // sampling factor, aka. v
47     ubyte qtable;
48     ubyte ac_table;
49     ubyte dc_table;
50 }
51 
52 struct HuffTab {
53     ubyte[256] values;
54     ubyte[257] sizes;
55     short[16] mincode;
56     short[16] maxcode;
57     short[16] valptr;
58 }
59 
60 enum MARKER : ubyte {
61     SOI = 0xd8,     // start of image
62     SOF0 = 0xc0,    // start of frame / baseline DCT
63     //SOF1 = 0xc1,    // start of frame / extended seq.
64     //SOF2 = 0xc2,    // start of frame / progressive DCT
65     SOF3 = 0xc3,    // start of frame / lossless
66     SOF9 = 0xc9,    // start of frame / extended seq., arithmetic
67     SOF11 = 0xcb,    // start of frame / lossless, arithmetic
68     DHT = 0xc4,     // define huffman tables
69     DQT = 0xdb,     // define quantization tables
70     DRI = 0xdd,     // define restart interval
71     SOS = 0xda,     // start of scan
72     DNL = 0xdc,     // define number of lines
73     RST0 = 0xd0,    // restart entropy coded data
74     // ...
75     RST7 = 0xd7,    // restart entropy coded data
76     APP0 = 0xe0,    // application 0 segment
77     // ...
78     APPf = 0xef,    // application f segment
79     //DAC = 0xcc,     // define arithmetic conditioning table
80     COM = 0xfe,     // comment
81     EOI = 0xd9,     // end of image
82 }
83 
84 bool detect_jpeg(Reader* rc)
85 {
86     auto info = read_jpeg_info(rc);
87     reset2start(rc);
88     return info.e == 0;
89 }
90 
91 IFInfo infoerror(ubyte e)
92 {
93     IFInfo info = { e: e };
94     return info;
95 }
96 
97 IFInfo read_jpeg_info(Reader* rc)
98 {
99     if (read_u8(rc) != 0xff || read_u8(rc) != MARKER.SOI)
100        return infoerror(rc.fail ? ERROR.stream : ERROR.data);
101 
102     while (true) {
103         if (read_u8(rc) != 0xff)
104             return infoerror(rc.fail ? ERROR.stream : ERROR.data);
105 
106         ubyte marker = read_u8(rc);
107         while (marker == 0xff && !rc.fail)
108             marker = read_u8(rc);
109 
110         if (rc.fail)
111             return infoerror(ERROR.stream);
112 
113         switch (marker) with (MARKER) {
114             case SOF0: .. case SOF3:
115             case SOF9: .. case SOF11:
116                 skip(rc, 3); // len + some byte
117                 IFInfo info;
118                 info.h = read_u16be(rc);
119                 info.w = read_u16be(rc);
120                 info.c = read_u8(rc);
121                 info.e = rc.fail ? ERROR.stream : 0;
122                 return info;
123             case SOS, EOI:
124                 return infoerror(ERROR.data);
125             case DRI, DHT, DQT, COM:
126             case APP0: .. case APPf:
127                 int len = read_u16be(rc) - 2;
128                 skip(rc, len);
129                 break;
130             default:
131                 return infoerror(ERROR.unsupp);
132         }
133     }
134     assert(0);
135 }
136 
137 ubyte read_jpeg(Reader* rc, out IFImage image, in int reqchans, in int reqbpc)
138 {
139     if (cast(uint) reqchans > 4)
140         return ERROR.arg;
141     const ubyte tbpc = cast(ubyte) (reqbpc ? reqbpc : 8);
142     if (tbpc != 8 && tbpc != 16)
143         return ERROR.unsupp;
144     if (read_u8(rc) != 0xff || read_u8(rc) != MARKER.SOI)
145        return rc.fail ? ERROR.stream : ERROR.data;
146     if (rc.fail)
147         return ERROR.stream;
148 
149     JPEGDecoder dc = { rc: rc };
150 
151     ubyte e = read_markers(dc);   // reads until first scan header or eoi
152 
153     if (e) return e;
154     if (dc.eoi_reached) return ERROR.data;
155 
156     dc.tchans = reqchans == 0 ? dc.num_comps : reqchans;
157 
158     if (cast(ulong) dc.width * dc.height * dc.tchans > MAXIMUM_IMAGE_SIZE)
159         return ERROR.bigimg;
160 
161     {
162         size_t[3] csizes;
163         size_t acc;
164         foreach (i, ref comp; dc.comps[0..dc.num_comps]) {
165             csizes[i] = dc.num_mcu_x * comp.sfx*8 * dc.num_mcu_y * comp.sfy*8;
166             acc += csizes[i];
167         }
168         dc.compsbuf = new_buffer(acc, e);
169         if (e) return e;
170         acc = 0;
171         foreach (i, ref comp; dc.comps[0..dc.num_comps]) {
172             comp.data = dc.compsbuf[acc .. acc + csizes[i]];
173             acc += csizes[i];
174         }
175     }
176     scope(exit)
177         _free(dc.compsbuf.ptr);
178 
179     // E.7 -- Multiple scans are for progressive images which are not supported
180     //while (!dc.eoi_reached) {
181         e = decode_scan(dc);    // E.2.3
182         //read_markers(dc);   // reads until next scan header or eoi
183     //}
184     if (e) return e;
185 
186     // throw away fill samples and convert to target format
187     ubyte[] buf = dc.reconstruct(e);
188     if (e) return e;
189 
190     if (VERTICAL_ORIENTATION_READ == -1)
191         e = flip_vertically(dc.width, dc.height, dc.tchans, buf);
192 
193     image.w   = dc.width;
194     image.h   = dc.height;
195     image.c   = cast(ubyte) dc.tchans;
196     image.cinfile = dc.num_comps;
197     image.bpc = tbpc;
198     if (tbpc == 8) {
199         image.buf8 = buf;
200     } else {
201         image.buf16 = bpc8to16(buf);
202         if (!image.buf16.ptr)
203             return ERROR.oom;
204     }
205     return 0;
206 }
207 
208 ubyte read_markers(ref JPEGDecoder dc)
209 {
210     bool has_next_scan_header = false;
211     while (!has_next_scan_header && !dc.eoi_reached) {
212         if (read_u8(dc.rc) != 0xff)
213             return dc.rc.fail ? ERROR.stream : ERROR.data;
214 
215         ubyte marker = read_u8(dc.rc);
216         while (marker == 0xff && !dc.rc.fail)
217             marker = read_u8(dc.rc);
218 
219         if (dc.rc.fail)
220             return ERROR.stream;
221 
222         ubyte e;
223         switch (marker) with (MARKER) {
224             case DHT:
225                 e = read_huffman_tables(dc);
226                 break;
227             case DQT:
228                 e = read_quantization_tables(dc);
229                 break;
230             case SOF0:
231                 if (dc.has_frame_header)
232                     return ERROR.data;
233                 e = read_frame_header(dc);
234                 dc.has_frame_header = true;
235                 break;
236             case SOS:
237                 if (!dc.has_frame_header)
238                     return ERROR.data;
239                 e = read_scan_header(dc);
240                 has_next_scan_header = true;
241                 break;
242             case DRI:
243                 if (read_u16be(dc.rc) != 4)  // len
244                     return dc.rc.fail ? ERROR.stream : ERROR.unsupp;
245                 dc.restart_interval = read_u16be(dc.rc);
246                 break;
247             case EOI:
248                 dc.eoi_reached = true;
249                 break;
250             case APP0: .. case APPf:
251             case COM:
252                 const int len = read_u16be(dc.rc) - 2;
253                 skip(dc.rc, len);
254                 break;
255             default:
256                 return ERROR.unsupp;
257         }
258         if (e)
259             return dc.rc.fail ? ERROR.stream : e;
260     }
261     return 0;
262 }
263 
264 // DHT -- define huffman tables
265 ubyte read_huffman_tables(ref JPEGDecoder dc)
266 {
267     ubyte[17] tmp;
268     int len = read_u16be(dc.rc) - 2;
269     if (dc.rc.fail) return ERROR.stream;
270 
271     ubyte e;
272     while (len > 0) {
273         read_block(dc.rc, tmp[0..17]);        // info byte & the BITS
274         if (dc.rc.fail) return ERROR.stream;
275         const ubyte tableslot  = tmp[0] & 0x0f; // must be 0 or 1 for baseline
276         const ubyte tableclass = tmp[0] >> 4;   // 0 = dc table, 1 = ac table
277         if (tableslot > 1 || tableclass > 1)
278             return ERROR.unsupp;
279 
280         // compute total number of huffman codes
281         int mt = 0;
282         foreach (i; 1..17)
283             mt += tmp[i];
284         if (256 < mt)
285             return ERROR.data;
286 
287         if (tableclass == 0) {
288             read_block(dc.rc, dc.dc_tables[tableslot].values[0..mt]);
289             derive_table(dc.dc_tables[tableslot], tmp[1..17], e);
290         } else {
291             read_block(dc.rc, dc.ac_tables[tableslot].values[0..mt]);
292             derive_table(dc.ac_tables[tableslot], tmp[1..17], e);
293         }
294 
295         len -= 17 + mt;
296     }
297     if (dc.rc.fail) return ERROR.stream;
298     return e;
299 }
300 
301 // num_values is the BITS
302 void derive_table(ref HuffTab table, in ref ubyte[16] num_values, ref ubyte e)
303 {
304     short[256] codes;
305 
306     int k = 0;
307     foreach (i; 0..16) {
308         foreach (j; 0..num_values[i]) {
309             if (k > table.sizes.length) {
310                 e = ERROR.data;
311                 return;
312             }
313             table.sizes[k] = cast(ubyte) (i + 1);
314             ++k;
315         }
316     }
317     table.sizes[k] = 0;
318 
319     k = 0;
320     short code = 0;
321     ubyte si = table.sizes[k];
322     while (true) {
323         do {
324             codes[k] = code;
325             ++code;
326             ++k;
327         } while (si == table.sizes[k]);
328 
329         if (table.sizes[k] == 0)
330             break;
331 
332         assert(si < table.sizes[k]);
333         do {
334             code <<= 1;
335             ++si;
336         } while (si != table.sizes[k]);
337     }
338 
339     derive_mincode_maxcode_valptr(
340         table.mincode, table.maxcode, table.valptr,
341         codes, num_values
342     );
343 }
344 
345 // F.15
346 void derive_mincode_maxcode_valptr(ref short[16] mincode, ref short[16] maxcode,
347      ref short[16] valptr, in ref short[256] codes, in ref ubyte[16] num_values)
348 {
349     mincode[] = -1;
350     maxcode[] = -1;
351     valptr[] = -1;
352 
353     int j = 0;
354     foreach (i; 0..16) {
355         if (num_values[i] != 0) {
356             valptr[i] = cast(short) j;
357             mincode[i] = codes[j];
358             j += num_values[i] - 1;
359             maxcode[i] = codes[j];
360             j += 1;
361         }
362     }
363 }
364 
365 // DQT -- define quantization tables
366 ubyte read_quantization_tables(ref JPEGDecoder dc)
367 {
368     int len = read_u16be(dc.rc);
369     if (len % 65 != 2)
370         return dc.rc.fail ? ERROR.stream : ERROR.data;
371     len -= 2;
372     while (len > 0) {
373         const ubyte info = read_u8(dc.rc);
374         const ubyte tableslot = info & 0x0f;
375         const ubyte precision = info >> 4;  // 0 = 8 bit, 1 = 16 bit
376         if (tableslot > 3 || precision != 0) // only 8 bit for baseline
377             return dc.rc.fail ? ERROR.stream : ERROR.unsupp;
378         read_block(dc.rc, dc.qtables[tableslot][0..64]);
379         len -= 1 + 64;
380     }
381     return dc.rc.fail ? ERROR.stream : 0;
382 }
383 
384 // SOF0 -- start of frame
385 ubyte read_frame_header(ref JPEGDecoder dc)
386 {
387     const int len = read_u16be(dc.rc);  // 8 + num_comps*3
388     const ubyte precision = read_u8(dc.rc);
389     dc.height = read_u16be(dc.rc);
390     dc.width = read_u16be(dc.rc);
391     dc.num_comps = read_u8(dc.rc);
392 
393     if ((dc.num_comps != 1 && dc.num_comps != 3)
394      || precision != 8 || len != 8 + dc.num_comps*3)
395         return ERROR.unsupp;
396 
397     dc.hmax = 0;
398     dc.vmax = 0;
399     int mcu_du = 0; // data units in one mcu
400     ubyte[9] tmp;
401     read_block(dc.rc, tmp[0..dc.num_comps*3]);
402     if (dc.rc.fail) return ERROR.stream;
403     foreach (i; 0..dc.num_comps) {
404         ubyte ci = tmp[i*3];
405         // JFIF says ci should be i+1, but there are images where ci is i. Normalize
406         // ids so that ci == i, always. So much for standards...
407         if (i == 0) { dc.correct_comp_ids = ci == i+1; }
408         if ((dc.correct_comp_ids && ci != i+1)
409         || (!dc.correct_comp_ids && ci != i))
410             return ERROR.data;
411 
412         Component* comp = &dc.comps[i];
413         const ubyte sampling_factors = tmp[i*3 + 1];
414         comp.sfx = sampling_factors >> 4;
415         comp.sfy = sampling_factors & 0xf;
416         comp.qtable = tmp[i*3 + 2];
417         if ( comp.sfy < 1 || 4 < comp.sfy ||
418              comp.sfx < 1 || 4 < comp.sfx ||
419              3 < comp.qtable )
420             return ERROR.unsupp;
421 
422         if (dc.hmax < comp.sfx) dc.hmax = comp.sfx;
423         if (dc.vmax < comp.sfy) dc.vmax = comp.sfy;
424 
425         mcu_du += comp.sfx * comp.sfy;
426     }
427     if (10 < mcu_du)
428         return ERROR.unsupp;
429 
430     assert(dc.hmax * dc.vmax);
431     foreach (i; 0..dc.num_comps) {
432         dc.comps[i].x = cast(size_t) ceil(dc.width * (cast(double) dc.comps[i].sfx /
433                     dc.hmax));
434         dc.comps[i].y = cast(size_t) ceil(dc.height * (cast(double) dc.comps[i].sfy /
435                     dc.vmax));
436     }
437 
438     size_t mcu_w = dc.hmax * 8;
439     size_t mcu_h = dc.vmax * 8;
440     dc.num_mcu_x = cast(int) ((dc.width + mcu_w-1) / mcu_w);
441     dc.num_mcu_y = cast(int) ((dc.height + mcu_h-1) / mcu_h);
442     return 0;
443 }
444 
445 // SOS -- start of scan
446 ubyte read_scan_header(ref JPEGDecoder dc)
447 {
448     const ushort len = read_u16be(dc.rc);
449     const ubyte num_scan_comps = read_u8(dc.rc);
450 
451     if ( num_scan_comps != dc.num_comps ||
452          len != (6+num_scan_comps*2) )
453         return dc.rc.fail ? ERROR.stream : ERROR.unsupp;
454 
455     ubyte[16] buf;
456     ubyte e;
457     read_block(dc.rc, buf[0..len-3]);
458     if (dc.rc.fail) return ERROR.stream;
459 
460     foreach (i; 0..num_scan_comps) {
461         const uint ci = buf[i*2] - (dc.correct_comp_ids ? 1 : 0);
462         if (ci >= dc.num_comps)
463             return ERROR.data;
464 
465         const ubyte tables = buf[i*2 + 1];
466         dc.comps[ci].dc_table = tables >> 4;
467         dc.comps[ci].ac_table = tables & 0x0f;
468         if (dc.comps[ci].dc_table > 1 || dc.comps[ci].ac_table > 1)
469             return ERROR.unsupp;
470     }
471 
472     // ignore these
473     //ubyte spectral_start = buf[$-3];
474     //ubyte spectral_end = buf[$-2];
475     //ubyte approx = buf[$-1];
476     return 0;
477 }
478 
479 // E.2.3 and E.8 and E.9
480 ubyte decode_scan(ref JPEGDecoder dc)
481 {
482     int intervals, mcus;
483     if (dc.restart_interval > 0) {
484         int total_mcus = dc.num_mcu_x * dc.num_mcu_y;
485         intervals = (total_mcus + dc.restart_interval-1) / dc.restart_interval;
486         mcus = dc.restart_interval;
487     } else {
488         intervals = 1;
489         mcus = dc.num_mcu_x * dc.num_mcu_y;
490     }
491 
492     ubyte e;
493     foreach (mcu_j; 0 .. dc.num_mcu_y) {
494         foreach (mcu_i; 0 .. dc.num_mcu_x) {
495 
496             // decode mcu
497             foreach (c; 0..dc.num_comps) {
498                 auto comp = &dc.comps[c];
499                 foreach (du_j; 0 .. comp.sfy) {
500                     foreach (du_i; 0 .. comp.sfx) {
501                         // decode entropy, dequantize & dezigzag
502                         short[64] data;
503                         e = decode_block(dc, *comp, dc.qtables[comp.qtable], data);
504                         if (e) return e;
505                         // idct & level-shift
506                         int outx = (mcu_i * comp.sfx + du_i) * 8;
507                         int outy = (mcu_j * comp.sfy + du_j) * 8;
508                         int dst_stride = dc.num_mcu_x * comp.sfx*8;
509                         ubyte* dst = comp.data.ptr + outy*dst_stride + outx;
510                         stbi__idct_block(dst, dst_stride, data);
511                     }
512                 }
513             }
514 
515             --mcus;
516 
517             if (!mcus) {
518                 --intervals;
519                 if (!intervals)
520                     return e;
521 
522                 e = read_restart(dc.rc);    // RSTx marker
523 
524                 if (intervals == 1) {
525                     // last interval, may have fewer MCUs than defined by DRI
526                     mcus = (dc.num_mcu_y - mcu_j - 1)
527                          * dc.num_mcu_x + dc.num_mcu_x - mcu_i - 1;
528                 } else {
529                     mcus = dc.restart_interval;
530                 }
531 
532                 // reset decoder
533                 dc.cb = 0;
534                 dc.bits_left = 0;
535                 foreach (k; 0..dc.num_comps)
536                     dc.comps[k].pred = 0;
537             }
538 
539         }
540     }
541     return e;
542 }
543 
544 // RST0-RST7
545 ubyte read_restart(Reader* rc) {
546     ubyte a = read_u8(rc);
547     ubyte b = read_u8(rc);
548     if (rc.fail) return ERROR.stream;
549     if (a != 0xff || b < MARKER.RST0 || b > MARKER.RST7)
550         return ERROR.data;
551     return 0;
552     // the markers should cycle 0 through 7, could check that here...
553 }
554 
555 immutable ubyte[64] dezigzag = [
556      0,  1,  8, 16,  9,  2,  3, 10,
557     17, 24, 32, 25, 18, 11,  4,  5,
558     12, 19, 26, 33, 40, 48, 41, 34,
559     27, 20, 13,  6,  7, 14, 21, 28,
560     35, 42, 49, 56, 57, 50, 43, 36,
561     29, 22, 15, 23, 30, 37, 44, 51,
562     58, 59, 52, 45, 38, 31, 39, 46,
563     53, 60, 61, 54, 47, 55, 62, 63,
564 ];
565 
566 // decode entropy, dequantize & dezigzag (see section F.2)
567 ubyte decode_block(ref JPEGDecoder dc, ref Component comp, in ref ubyte[64] qtable,
568                                                               out short[64] result)
569 {
570     ubyte e;
571     const ubyte t = decode_huff(dc, dc.dc_tables[comp.dc_table], e);
572     if (e) return e;
573     const int diff = t ? dc.receive_and_extend(t, e) : 0;
574 
575     comp.pred = comp.pred + diff;
576     result[0] = cast(short) (comp.pred * qtable[0]);
577 
578     int k = 1;
579     do {
580         ubyte rs = decode_huff(dc, dc.ac_tables[comp.ac_table], e);
581         if (e) return e;
582         ubyte rrrr = rs >> 4;
583         ubyte ssss = rs & 0xf;
584 
585         if (ssss == 0) {
586             if (rrrr != 0xf)
587                 break;      // end of block
588             k += 16;    // run length is 16
589             continue;
590         }
591 
592         k += rrrr;
593 
594         if (63 < k)
595             return ERROR.data;
596         result[dezigzag[k]] = cast(short) (dc.receive_and_extend(ssss, e) * qtable[k]);
597         k += 1;
598     } while (k < 64);
599 
600     return 0;
601 }
602 
603 int receive_and_extend(ref JPEGDecoder dc, in ubyte s, ref ubyte e)
604 {
605     // receive
606     int symbol = 0;
607     foreach (_; 0..s)
608         symbol = (symbol << 1) + nextbit(dc, e);
609     // extend
610     int vt = 1 << (s-1);
611     if (symbol < vt)
612         return symbol + (-1 << s) + 1;
613     return symbol;
614 }
615 
616 // F.16 -- the DECODE
617 ubyte decode_huff(ref JPEGDecoder dc, in ref HuffTab tab, ref ubyte e)
618 {
619     short code = nextbit(dc, e);
620 
621     int i = 0;
622     while (tab.maxcode[i] < code) {
623         code = cast(short) ((code << 1) + nextbit(dc, e));
624         i += 1;
625         if (i >= tab.maxcode.length) {
626             e = ERROR.data;
627             return 0;
628         }
629     }
630     const uint j = cast(uint) (tab.valptr[i] + code - tab.mincode[i]);
631     if (j >= tab.values.length) {
632         e = ERROR.data;
633         return 0;
634     }
635     return tab.values[j];
636 }
637 
638 // F.2.2.5 and F.18
639 ubyte nextbit(ref JPEGDecoder dc, ref ubyte e)
640 {
641     if (!dc.bits_left) {
642         dc.cb = read_u8(dc.rc);
643         dc.bits_left = 8;
644 
645         if (dc.cb == 0xff) {
646             if (read_u8(dc.rc) != 0x0) {
647                 e = dc.rc.fail ? ERROR.stream : ERROR.data; // unexpected marker
648                 return 0;
649             }
650         }
651         if (dc.rc.fail) e = ERROR.stream;
652     }
653 
654     ubyte r = dc.cb >> 7;
655     dc.cb <<= 1;
656     dc.bits_left -= 1;
657     return r;
658 }
659 
660 ubyte[] reconstruct(in ref JPEGDecoder dc, ref ubyte e)
661 {
662     ubyte[] result = new_buffer(dc.width * dc.height * dc.tchans, e);
663     if (e) return null;
664 
665     switch (dc.num_comps * 10 + dc.tchans) {
666         case 34, 33:
667             // Use specialized bilinear filtering functions for the frequent cases
668             // where Cb & Cr channels have half resolution.
669             if (dc.comps[0].sfx <= 2 && dc.comps[0].sfy <= 2 &&
670                 dc.comps[0].sfx + dc.comps[0].sfy >= 3       &&
671                 dc.comps[1].sfx == 1 && dc.comps[1].sfy == 1 &&
672                 dc.comps[2].sfx == 1 && dc.comps[2].sfy == 1)
673             {
674                 void function(in ubyte[], in ubyte[], ubyte[]) nothrow resample;
675                 switch (dc.comps[0].sfx * 10 + dc.comps[0].sfy) {
676                     case 22: resample = &upsample_h2_v2; break;
677                     case 21: resample = &upsample_h2_v1; break;
678                     case 12: resample = &upsample_h1_v2; break;
679                     default: assert(0);
680                 }
681 
682                 ubyte[] comps1n2 = new_buffer(dc.width * 2, e);
683                 if (e) return null;
684                 scope(exit) _free(comps1n2.ptr);
685                 ubyte[] comp1 = comps1n2[0..dc.width];
686                 ubyte[] comp2 = comps1n2[dc.width..$];
687 
688                 size_t s = 0;
689                 size_t di = 0;
690                 foreach (j; 0 .. dc.height) {
691                     const size_t mi = j / dc.comps[0].sfy;
692                     const size_t si = (mi == 0 || mi >= (dc.height-1)/dc.comps[0].sfy)
693                                     ? mi : mi - 1 + s * 2;
694                     s = s ^ 1;
695 
696                     const size_t cs = dc.num_mcu_x * dc.comps[1].sfx * 8;
697                     const size_t cl0 = mi * cs;
698                     const size_t cl1 = si * cs;
699                     resample(dc.comps[1].data[cl0 .. cl0 + dc.comps[1].x],
700                              dc.comps[1].data[cl1 .. cl1 + dc.comps[1].x],
701                              comp1[]);
702                     resample(dc.comps[2].data[cl0 .. cl0 + dc.comps[2].x],
703                              dc.comps[2].data[cl1 .. cl1 + dc.comps[2].x],
704                              comp2[]);
705 
706                     foreach (i; 0 .. dc.width) {
707                         result[di .. di+3] = ycbcr_to_rgb(
708                             dc.comps[0].data[j * dc.num_mcu_x * dc.comps[0].sfx * 8 + i],
709                             comp1[i],
710                             comp2[i],
711                         );
712                         if (dc.tchans == 4)
713                             result[di+3] = 255;
714                         di += dc.tchans;
715                     }
716                 }
717 
718                 return result;
719             }
720 
721             foreach (const ref comp; dc.comps[0..dc.num_comps]) {
722                 if (comp.sfx != dc.hmax || comp.sfy != dc.vmax)
723                     return dc.upsample(result);
724             }
725 
726             size_t si, di;
727             foreach (j; 0 .. dc.height) {
728                 foreach (i; 0 .. dc.width) {
729                     result[di .. di+3] = ycbcr_to_rgb(
730                         dc.comps[0].data[si+i],
731                         dc.comps[1].data[si+i],
732                         dc.comps[2].data[si+i],
733                     );
734                     if (dc.tchans == 4)
735                         result[di+3] = 255;
736                     di += dc.tchans;
737                 }
738                 si += dc.num_mcu_x * dc.comps[0].sfx * 8;
739             }
740             return result;
741         case 32, 12, 31, 11:
742             const comp = &dc.comps[0];
743             if (comp.sfx == dc.hmax && comp.sfy == dc.vmax) {
744                 size_t si, di;
745                 if (dc.tchans == 2) {
746                     foreach (j; 0 .. dc.height) {
747                         foreach (i; 0 .. dc.width) {
748                             result[di++] = comp.data[si+i];
749                             result[di++] = 255;
750                         }
751                         si += dc.num_mcu_x * comp.sfx * 8;
752                     }
753                 } else {
754                     foreach (j; 0 .. dc.height) {
755                         result[di .. di+dc.width] = comp.data[si .. si+dc.width];
756                         si += dc.num_mcu_x * comp.sfx * 8;
757                         di += dc.width;
758                     }
759                 }
760                 return result;
761             } else {
762                 // need to resample (haven't tested this...)
763                 return dc.upsample_luma(result);
764             }
765         case 14, 13:
766             const comp = &dc.comps[0];
767             size_t si, di;
768             foreach (j; 0 .. dc.height) {
769                 foreach (i; 0 .. dc.width) {
770                     result[di .. di+3] = comp.data[si+i];
771                     if (dc.tchans == 4)
772                         result[di+3] = 255;
773                     di += dc.tchans;
774                 }
775                 si += dc.num_mcu_x * comp.sfx * 8;
776             }
777             return result;
778         default:
779             assert(0);
780     }
781 }
782 
783 void upsample_h2_v2(in ubyte[] line0, in ubyte[] line1, ubyte[] result)
784 {
785     ubyte mix(ubyte mm, ubyte ms, ubyte sm, ubyte ss)
786     {
787         return cast(ubyte) (( cast(uint) mm * 3 * 3
788                             + cast(uint) ms * 3 * 1
789                             + cast(uint) sm * 1 * 3
790                             + cast(uint) ss * 1 * 1
791                             + 8) / 16);
792     }
793 
794     result[0] = cast(ubyte) (( cast(uint) line0[0] * 3
795                              + cast(uint) line1[0] * 1
796                              + 2) / 4);
797     if (line0.length == 1)
798         return;
799     result[1] = mix(line0[0], line0[1], line1[0], line1[1]);
800 
801     size_t di = 2;
802     foreach (i; 1 .. line0.length) {
803         result[di] = mix(line0[i], line0[i-1], line1[i], line1[i-1]);
804         di += 1;
805         if (i == line0.length-1) {
806             if (di < result.length) {
807                 result[di] = cast(ubyte) (( cast(uint) line0[i] * 3
808                                           + cast(uint) line1[i] * 1
809                                           + 2) / 4);
810             }
811             return;
812         }
813         result[di] = mix(line0[i], line0[i+1], line1[i], line1[i+1]);
814         di += 1;
815     }
816 }
817 
818 void upsample_h2_v1(in ubyte[] line0, in ubyte[] _line1, ubyte[] result)
819 {
820     result[0] = line0[0];
821     if (line0.length == 1)
822         return;
823     result[1] = cast(ubyte) (( cast(uint) line0[0] * 3
824                              + cast(uint) line0[1] * 1
825                              + 2) / 4);
826     size_t di = 2;
827     foreach (i; 1 .. line0.length) {
828         result[di] = cast(ubyte) (( cast(uint) line0[i-1] * 1
829                                   + cast(uint) line0[i+0] * 3
830                                   + 2) / 4);
831         di += 1;
832         if (i == line0.length-1) {
833             if (di < result.length) result[di] = line0[i];
834             return;
835         }
836         result[di] = cast(ubyte) (( cast(uint) line0[i+0] * 3
837                                   + cast(uint) line0[i+1] * 1
838                                   + 2) / 4);
839         di += 1;
840     }
841 }
842 
843 void upsample_h1_v2(in ubyte[] line0, in ubyte[] line1, ubyte[] result)
844 {
845     foreach (i; 0 .. result.length) {
846         result[i] = cast(ubyte) (( cast(uint) line0[i] * 3
847                                  + cast(uint) line1[i] * 1
848                                  + 2) / 4);
849     }
850 }
851 
852 // Nearest neighbor
853 ubyte[] upsample_luma(in ref JPEGDecoder dc, ubyte[] result)
854 {
855     const size_t stride0 = dc.num_mcu_x * dc.comps[0].sfx * 8;
856     const y_step0 = cast(float) dc.comps[0].sfy / cast(float) dc.vmax;
857     const x_step0 = cast(float) dc.comps[0].sfx / cast(float) dc.hmax;
858 
859     float y0 = y_step0 * 0.5;
860     size_t y0i = 0;
861 
862     size_t di;
863 
864     foreach (j; 0 .. dc.height) {
865         float x0 = x_step0 * 0.5;
866         size_t x0i = 0;
867         foreach (i; 0 .. dc.width) {
868             result[di] = dc.comps[0].data[y0i + x0i];
869             if (dc.tchans == 2)
870                 result[di+1] = 255;
871             di += dc.tchans;
872             x0 += x_step0;
873             if (x0 >= 1.0) {
874                 x0 -= 1.0;
875                 x0i += 1;
876             }
877         }
878         y0 += y_step0;
879         if (y0 >= 1.0) {
880             y0 -= 1.0;
881             y0i += stride0;
882         }
883     }
884     return result;
885 }
886 
887 // Nearest neighbor
888 ubyte[] upsample(in ref JPEGDecoder dc, ubyte[] result)
889 {
890     const size_t stride0 = dc.num_mcu_x * dc.comps[0].sfx * 8;
891     const size_t stride1 = dc.num_mcu_x * dc.comps[1].sfx * 8;
892     const size_t stride2 = dc.num_mcu_x * dc.comps[2].sfx * 8;
893 
894     const y_step0 = cast(float) dc.comps[0].sfy / cast(float) dc.vmax;
895     const y_step1 = cast(float) dc.comps[1].sfy / cast(float) dc.vmax;
896     const y_step2 = cast(float) dc.comps[2].sfy / cast(float) dc.vmax;
897     const x_step0 = cast(float) dc.comps[0].sfx / cast(float) dc.hmax;
898     const x_step1 = cast(float) dc.comps[1].sfx / cast(float) dc.hmax;
899     const x_step2 = cast(float) dc.comps[2].sfx / cast(float) dc.hmax;
900 
901     float y0 = y_step0 * 0.5;
902     float y1 = y_step1 * 0.5;
903     float y2 = y_step2 * 0.5;
904     size_t y0i = 0;
905     size_t y1i = 0;
906     size_t y2i = 0;
907 
908     size_t di;
909 
910     foreach (_j; 0 .. dc.height) {
911         float x0 = x_step0 * 0.5;
912         float x1 = x_step1 * 0.5;
913         float x2 = x_step2 * 0.5;
914         size_t x0i = 0;
915         size_t x1i = 0;
916         size_t x2i = 0;
917         foreach (i; 0 .. dc.width) {
918             result[di .. di+3] = ycbcr_to_rgb(
919                 dc.comps[0].data[y0i + x0i],
920                 dc.comps[1].data[y1i + x1i],
921                 dc.comps[2].data[y2i + x2i],
922             );
923             if (dc.tchans == 4)
924                 result[di+3] = 255;
925             di += dc.tchans;
926             x0 += x_step0;
927             x1 += x_step1;
928             x2 += x_step2;
929             if (x0 >= 1.0) { x0 -= 1.0; x0i += 1; }
930             if (x1 >= 1.0) { x1 -= 1.0; x1i += 1; }
931             if (x2 >= 1.0) { x2 -= 1.0; x2i += 1; }
932         }
933         y0 += y_step0;
934         y1 += y_step1;
935         y2 += y_step2;
936         if (y0 >= 1.0) { y0 -= 1.0; y0i += stride0; }
937         if (y1 >= 1.0) { y1 -= 1.0; y1i += stride1; }
938         if (y2 >= 1.0) { y2 -= 1.0; y2i += stride2; }
939     }
940     return result;
941 }
942 
943 ubyte[3] ycbcr_to_rgb(in ubyte y, in ubyte cb, in ubyte cr) pure {
944     ubyte[3] rgb = void;
945     rgb[0] = clamp(y + 1.402*(cr-128));
946     rgb[1] = clamp(y - 0.34414*(cb-128) - 0.71414*(cr-128));
947     rgb[2] = clamp(y + 1.772*(cb-128));
948     return rgb;
949 }
950 
951 ubyte clamp(in float x) pure {
952     if (x < 0) return 0;
953     if (255 < x) return 255;
954     return cast(ubyte) x;
955 }
956 
957 // ------------------------------------------------------------
958 // The IDCT stuff here (to the next dashed line) is copied and adapted from
959 // stb_image which is released under public domain.  Many thanks to stb_image
960 // author, Sean Barrett.
961 // Link: https://github.com/nothings/stb/blob/master/stb_image.h
962 
963 pure int f2f(float x) { return cast(int) (x * 4096 + 0.5); }
964 pure int fsh(int x) { return x << 12; }
965 
966 // from stb_image, derived from jidctint -- DCT_ISLOW
967 pure void STBI__IDCT_1D(ref int t0, ref int t1, ref int t2, ref int t3,
968                         ref int x0, ref int x1, ref int x2, ref int x3,
969         int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7)
970 {
971    int p1,p2,p3,p4,p5;
972    //int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3;
973    p2 = s2;
974    p3 = s6;
975    p1 = (p2+p3) * f2f(0.5411961f);
976    t2 = p1 + p3 * f2f(-1.847759065f);
977    t3 = p1 + p2 * f2f( 0.765366865f);
978    p2 = s0;
979    p3 = s4;
980    t0 = fsh(p2+p3);
981    t1 = fsh(p2-p3);
982    x0 = t0+t3;
983    x3 = t0-t3;
984    x1 = t1+t2;
985    x2 = t1-t2;
986    t0 = s7;
987    t1 = s5;
988    t2 = s3;
989    t3 = s1;
990    p3 = t0+t2;
991    p4 = t1+t3;
992    p1 = t0+t3;
993    p2 = t1+t2;
994    p5 = (p3+p4)*f2f( 1.175875602f);
995    t0 = t0*f2f( 0.298631336f);
996    t1 = t1*f2f( 2.053119869f);
997    t2 = t2*f2f( 3.072711026f);
998    t3 = t3*f2f( 1.501321110f);
999    p1 = p5 + p1*f2f(-0.899976223f);
1000    p2 = p5 + p2*f2f(-2.562915447f);
1001    p3 = p3*f2f(-1.961570560f);
1002    p4 = p4*f2f(-0.390180644f);
1003    t3 += p1+p4;
1004    t2 += p2+p3;
1005    t1 += p2+p4;
1006    t0 += p1+p3;
1007 }
1008 
1009 // idct and level-shift
1010 pure void stbi__idct_block(ubyte* dst, in int dst_stride, in ref short[64] data)
1011 {
1012    int i;
1013    int[64] val;
1014    int* v = val.ptr;
1015    const(short)* d = data.ptr;
1016 
1017    // columns
1018    for (i=0; i < 8; ++i,++d, ++v) {
1019       // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
1020       if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
1021            && d[40]==0 && d[48]==0 && d[56]==0) {
1022          //    no shortcut                 0     seconds
1023          //    (1|2|3|4|5|6|7)==0          0     seconds
1024          //    all separate               -0.047 seconds
1025          //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
1026          int dcterm = d[0] << 2;
1027          v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
1028       } else {
1029          int t0,t1,t2,t3,x0,x1,x2,x3;
1030          STBI__IDCT_1D(
1031              t0, t1, t2, t3,
1032              x0, x1, x2, x3,
1033              d[ 0], d[ 8], d[16], d[24],
1034              d[32], d[40], d[48], d[56]
1035          );
1036          // constants scaled things up by 1<<12; let's bring them back
1037          // down, but keep 2 extra bits of precision
1038          x0 += 512; x1 += 512; x2 += 512; x3 += 512;
1039          v[ 0] = (x0+t3) >> 10;
1040          v[56] = (x0-t3) >> 10;
1041          v[ 8] = (x1+t2) >> 10;
1042          v[48] = (x1-t2) >> 10;
1043          v[16] = (x2+t1) >> 10;
1044          v[40] = (x2-t1) >> 10;
1045          v[24] = (x3+t0) >> 10;
1046          v[32] = (x3-t0) >> 10;
1047       }
1048    }
1049 
1050    ubyte* o = dst;
1051    for (i=0, v=val.ptr; i < 8; ++i,v+=8,o+=dst_stride) {
1052       // no fast case since the first 1D IDCT spread components out
1053       int t0,t1,t2,t3,x0,x1,x2,x3;
1054       STBI__IDCT_1D(
1055           t0, t1, t2, t3,
1056           x0, x1, x2, x3,
1057           v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7]
1058       );
1059       // constants scaled things up by 1<<12, plus we had 1<<2 from first
1060       // loop, plus horizontal and vertical each scale by sqrt(8) so together
1061       // we've got an extra 1<<3, so 1<<17 total we need to remove.
1062       // so we want to round that, which means adding 0.5 * 1<<17,
1063       // aka 65536. Also, we'll end up with -128 to 127 that we want
1064       // to encode as 0-255 by adding 128, so we'll add that before the shift
1065       x0 += 65536 + (128<<17);
1066       x1 += 65536 + (128<<17);
1067       x2 += 65536 + (128<<17);
1068       x3 += 65536 + (128<<17);
1069       // tried computing the shifts into temps, or'ing the temps to see
1070       // if any were out of range, but that was slower
1071       o[0] = stbi__clamp((x0+t3) >> 17);
1072       o[7] = stbi__clamp((x0-t3) >> 17);
1073       o[1] = stbi__clamp((x1+t2) >> 17);
1074       o[6] = stbi__clamp((x1-t2) >> 17);
1075       o[2] = stbi__clamp((x2+t1) >> 17);
1076       o[5] = stbi__clamp((x2-t1) >> 17);
1077       o[3] = stbi__clamp((x3+t0) >> 17);
1078       o[4] = stbi__clamp((x3-t0) >> 17);
1079    }
1080 }
1081 
1082 // clamp to 0-255
1083 pure ubyte stbi__clamp(int x) {
1084    if (cast(uint) x > 255) {
1085       if (x < 0) return 0;
1086       if (x > 255) return 255;
1087    }
1088    return cast(ubyte) x;
1089 }
1090 
1091 // ------------------------------------------------------------