1 // Copyright 2019 Tero Hänninen. All rights reserved.
2 // SPDX-License-Identifier: BSD-2-Clause
3 module imagefmt.jpeg;
4 
5 import std.math : ceil;
6 import imagefmt;
7 
8 @nogc nothrow package:
9 
10 struct JPEGDecoder {
11     Reader* rc;
12 
13     ubyte[64][4] qtables;
14     HuffTab[2] ac_tables;
15     HuffTab[2] dc_tables;
16 
17     int bits_left;   // num of unused bits in cb
18     ubyte cb;  // current byte (next bit always at MSB)
19 
20     bool has_frame_header = false;
21     bool eoi_reached = false;
22 
23     bool correct_comp_ids;
24     ubyte[] compsbuf;
25     Component[3] comps;
26     ubyte num_comps;
27     int tchans;
28 
29     int width;
30     int height;
31     int hmax;
32     int vmax;
33     int num_mcu_x;
34     int num_mcu_y;
35 
36     ushort restart_interval;    // number of MCUs in restart interval
37 }
38 
39 // image component
40 struct Component {
41     size_t x;       // total num of samples, without fill samples
42     size_t y;       // total num of samples, without fill samples
43     ubyte[] data;   // reconstructed samples
44     int pred;       // dc prediction
45     ubyte sfx;      // sampling factor, aka. h
46     ubyte sfy;      // sampling factor, aka. v
47     ubyte qtable;
48     ubyte ac_table;
49     ubyte dc_table;
50 }
51 
52 struct HuffTab {
53     ubyte[256] values;
54     ubyte[257] sizes;
55     short[16] mincode;
56     short[16] maxcode;
57     short[16] valptr;
58 }
59 
60 enum MARKER : ubyte {
61     SOI = 0xd8,     // start of image
62     SOF0 = 0xc0,    // start of frame / baseline DCT
63     //SOF1 = 0xc1,    // start of frame / extended seq.
64     //SOF2 = 0xc2,    // start of frame / progressive DCT
65     SOF3 = 0xc3,    // start of frame / lossless
66     SOF9 = 0xc9,    // start of frame / extended seq., arithmetic
67     SOF11 = 0xcb,    // start of frame / lossless, arithmetic
68     DHT = 0xc4,     // define huffman tables
69     DQT = 0xdb,     // define quantization tables
70     DRI = 0xdd,     // define restart interval
71     SOS = 0xda,     // start of scan
72     DNL = 0xdc,     // define number of lines
73     RST0 = 0xd0,    // restart entropy coded data
74     // ...
75     RST7 = 0xd7,    // restart entropy coded data
76     APP0 = 0xe0,    // application 0 segment
77     // ...
78     APPf = 0xef,    // application f segment
79     //DAC = 0xcc,     // define arithmetic conditioning table
80     COM = 0xfe,     // comment
81     EOI = 0xd9,     // end of image
82 }
83 
84 bool detect_jpeg(Reader* rc)
85 {
86     auto info = read_jpeg_info(rc);
87     reset2start(rc);
88     return info.e == 0;
89 }
90 
91 IFInfo infoerror(ubyte e)
92 {
93     IFInfo info = { e: e };
94     return info;
95 }
96 
97 IFInfo read_jpeg_info(Reader* rc)
98 {
99     if (read_u8(rc) != 0xff || read_u8(rc) != MARKER.SOI)
100        return infoerror(rc.fail ? ERROR.stream : ERROR.data);
101 
102     while (true) {
103         if (read_u8(rc) != 0xff)
104             return infoerror(rc.fail ? ERROR.stream : ERROR.data);
105 
106         ubyte marker = read_u8(rc);
107         while (marker == 0xff && !rc.fail)
108             marker = read_u8(rc);
109 
110         if (rc.fail)
111             return infoerror(ERROR.stream);
112 
113         switch (marker) with (MARKER) {
114             case SOF0: .. case SOF3:
115             case SOF9: .. case SOF11:
116                 skip(rc, 3); // len + some byte
117                 IFInfo info;
118                 info.h = read_u16be(rc);
119                 info.w = read_u16be(rc);
120                 info.c = read_u8(rc);
121                 info.e = rc.fail ? ERROR.stream : 0;
122                 return info;
123             case SOS, EOI:
124                 return infoerror(ERROR.data);
125             case DRI, DHT, DQT, COM:
126             case APP0: .. case APPf:
127                 int len = read_u16be(rc) - 2;
128                 skip(rc, len);
129                 break;
130             default:
131                 return infoerror(ERROR.unsupp);
132         }
133     }
134     assert(0);
135 }
136 
137 ubyte read_jpeg(Reader* rc, out IFImage image, in int reqchans, in int reqbpc)
138 {
139     if (cast(uint) reqchans > 4)
140         return ERROR.arg;
141     const ubyte tbpc = cast(ubyte) (reqbpc ? reqbpc : 8);
142     if (tbpc != 8 && tbpc != 16)
143         return ERROR.unsupp;
144     if (read_u8(rc) != 0xff || read_u8(rc) != MARKER.SOI)
145        return rc.fail ? ERROR.stream : ERROR.data;
146     if (rc.fail)
147         return ERROR.stream;
148 
149     JPEGDecoder dc = { rc: rc };
150 
151     ubyte e = read_markers(dc);   // reads until first scan header or eoi
152 
153     if (e) return e;
154     if (dc.eoi_reached) return ERROR.data;
155 
156     dc.tchans = reqchans == 0 ? dc.num_comps : reqchans;
157 
158     if (cast(ulong) dc.width * dc.height * dc.tchans > MAXIMUM_IMAGE_SIZE)
159         return ERROR.bigimg;
160 
161     {
162         size_t[3] csizes;
163         size_t acc;
164         foreach (i, ref comp; dc.comps[0..dc.num_comps]) {
165             csizes[i] = dc.num_mcu_x * comp.sfx*8 * dc.num_mcu_y * comp.sfy*8;
166             acc += csizes[i];
167         }
168         dc.compsbuf = new_buffer(acc, e);
169         if (e) return e;
170         acc = 0;
171         foreach (i, ref comp; dc.comps[0..dc.num_comps]) {
172             comp.data = dc.compsbuf[acc .. acc + csizes[i]];
173             acc += csizes[i];
174         }
175     }
176     scope(exit)
177         _free(dc.compsbuf.ptr);
178 
179     // E.7 -- Multiple scans are for progressive images which are not supported
180     //while (!dc.eoi_reached) {
181         e = decode_scan(dc);    // E.2.3
182         //read_markers(dc);   // reads until next scan header or eoi
183     //}
184     if (e) return e;
185 
186     // throw away fill samples and convert to target format
187     ubyte[] buf = dc.reconstruct(e);
188     if (e) return e;
189 
190     image.w   = dc.width;
191     image.h   = dc.height;
192     image.c   = cast(ubyte) dc.tchans;
193     image.cinfile = dc.num_comps;
194     image.bpc = tbpc;
195     if (tbpc == 8) {
196         image.buf8 = buf;
197     } else {
198         image.buf16 = bpc8to16(buf);
199         if (!image.buf16.ptr)
200             return ERROR.oom;
201     }
202     return 0;
203 }
204 
205 ubyte read_markers(ref JPEGDecoder dc)
206 {
207     bool has_next_scan_header = false;
208     while (!has_next_scan_header && !dc.eoi_reached) {
209         if (read_u8(dc.rc) != 0xff)
210             return dc.rc.fail ? ERROR.stream : ERROR.data;
211 
212         ubyte marker = read_u8(dc.rc);
213         while (marker == 0xff && !dc.rc.fail)
214             marker = read_u8(dc.rc);
215 
216         if (dc.rc.fail)
217             return ERROR.stream;
218 
219         ubyte e;
220         switch (marker) with (MARKER) {
221             case DHT:
222                 e = read_huffman_tables(dc);
223                 break;
224             case DQT:
225                 e = read_quantization_tables(dc);
226                 break;
227             case SOF0:
228                 if (dc.has_frame_header)
229                     return ERROR.data;
230                 e = read_frame_header(dc);
231                 dc.has_frame_header = true;
232                 break;
233             case SOS:
234                 if (!dc.has_frame_header)
235                     return ERROR.data;
236                 e = read_scan_header(dc);
237                 has_next_scan_header = true;
238                 break;
239             case DRI:
240                 if (read_u16be(dc.rc) != 4)  // len
241                     return dc.rc.fail ? ERROR.stream : ERROR.unsupp;
242                 dc.restart_interval = read_u16be(dc.rc);
243                 break;
244             case EOI:
245                 dc.eoi_reached = true;
246                 break;
247             case APP0: .. case APPf:
248             case COM:
249                 const int len = read_u16be(dc.rc) - 2;
250                 skip(dc.rc, len);
251                 break;
252             default:
253                 return ERROR.unsupp;
254         }
255         if (e)
256             return dc.rc.fail ? ERROR.stream : e;
257     }
258     return 0;
259 }
260 
261 // DHT -- define huffman tables
262 ubyte read_huffman_tables(ref JPEGDecoder dc)
263 {
264     ubyte[17] tmp;
265     int len = read_u16be(dc.rc) - 2;
266     if (dc.rc.fail) return ERROR.stream;
267 
268     ubyte e;
269     while (len > 0) {
270         read_block(dc.rc, tmp[0..17]);        // info byte & the BITS
271         if (dc.rc.fail) return ERROR.stream;
272         const ubyte tableslot  = tmp[0] & 0x0f; // must be 0 or 1 for baseline
273         const ubyte tableclass = tmp[0] >> 4;   // 0 = dc table, 1 = ac table
274         if (tableslot > 1 || tableclass > 1)
275             return ERROR.unsupp;
276 
277         // compute total number of huffman codes
278         int mt = 0;
279         foreach (i; 1..17)
280             mt += tmp[i];
281         if (256 < mt)
282             return ERROR.data;
283 
284         if (tableclass == 0) {
285             read_block(dc.rc, dc.dc_tables[tableslot].values[0..mt]);
286             derive_table(dc.dc_tables[tableslot], tmp[1..17], e);
287         } else {
288             read_block(dc.rc, dc.ac_tables[tableslot].values[0..mt]);
289             derive_table(dc.ac_tables[tableslot], tmp[1..17], e);
290         }
291 
292         len -= 17 + mt;
293     }
294     if (dc.rc.fail) return ERROR.stream;
295     return e;
296 }
297 
298 // num_values is the BITS
299 void derive_table(ref HuffTab table, in ref ubyte[16] num_values, ref ubyte e)
300 {
301     short[256] codes;
302 
303     int k = 0;
304     foreach (i; 0..16) {
305         foreach (j; 0..num_values[i]) {
306             if (k > table.sizes.length) {
307                 e = ERROR.data;
308                 return;
309             }
310             table.sizes[k] = cast(ubyte) (i + 1);
311             ++k;
312         }
313     }
314     table.sizes[k] = 0;
315 
316     k = 0;
317     short code = 0;
318     ubyte si = table.sizes[k];
319     while (true) {
320         do {
321             codes[k] = code;
322             ++code;
323             ++k;
324         } while (si == table.sizes[k]);
325 
326         if (table.sizes[k] == 0)
327             break;
328 
329         assert(si < table.sizes[k]);
330         do {
331             code <<= 1;
332             ++si;
333         } while (si != table.sizes[k]);
334     }
335 
336     derive_mincode_maxcode_valptr(
337         table.mincode, table.maxcode, table.valptr,
338         codes, num_values
339     );
340 }
341 
342 // F.15
343 void derive_mincode_maxcode_valptr(ref short[16] mincode, ref short[16] maxcode,
344      ref short[16] valptr, in ref short[256] codes, in ref ubyte[16] num_values)
345 {
346     mincode[] = -1;
347     maxcode[] = -1;
348     valptr[] = -1;
349 
350     int j = 0;
351     foreach (i; 0..16) {
352         if (num_values[i] != 0) {
353             valptr[i] = cast(short) j;
354             mincode[i] = codes[j];
355             j += num_values[i] - 1;
356             maxcode[i] = codes[j];
357             j += 1;
358         }
359     }
360 }
361 
362 // DQT -- define quantization tables
363 ubyte read_quantization_tables(ref JPEGDecoder dc)
364 {
365     int len = read_u16be(dc.rc);
366     if (len % 65 != 2)
367         return dc.rc.fail ? ERROR.stream : ERROR.data;
368     len -= 2;
369     while (len > 0) {
370         const ubyte info = read_u8(dc.rc);
371         const ubyte tableslot = info & 0x0f;
372         const ubyte precision = info >> 4;  // 0 = 8 bit, 1 = 16 bit
373         if (tableslot > 3 || precision != 0) // only 8 bit for baseline
374             return dc.rc.fail ? ERROR.stream : ERROR.unsupp;
375         read_block(dc.rc, dc.qtables[tableslot][0..64]);
376         len -= 1 + 64;
377     }
378     return dc.rc.fail ? ERROR.stream : 0;
379 }
380 
381 // SOF0 -- start of frame
382 ubyte read_frame_header(ref JPEGDecoder dc)
383 {
384     const int len = read_u16be(dc.rc);  // 8 + num_comps*3
385     const ubyte precision = read_u8(dc.rc);
386     dc.height = read_u16be(dc.rc);
387     dc.width = read_u16be(dc.rc);
388     dc.num_comps = read_u8(dc.rc);
389 
390     if ((dc.num_comps != 1 && dc.num_comps != 3)
391      || precision != 8 || len != 8 + dc.num_comps*3)
392         return ERROR.unsupp;
393 
394     dc.hmax = 0;
395     dc.vmax = 0;
396     int mcu_du = 0; // data units in one mcu
397     ubyte[9] tmp;
398     read_block(dc.rc, tmp[0..dc.num_comps*3]);
399     if (dc.rc.fail) return ERROR.stream;
400     foreach (i; 0..dc.num_comps) {
401         ubyte ci = tmp[i*3];
402         // JFIF says ci should be i+1, but there are images where ci is i. Normalize
403         // ids so that ci == i, always. So much for standards...
404         if (i == 0) { dc.correct_comp_ids = ci == i+1; }
405         if ((dc.correct_comp_ids && ci != i+1)
406         || (!dc.correct_comp_ids && ci != i))
407             return ERROR.data;
408 
409         Component* comp = &dc.comps[i];
410         const ubyte sampling_factors = tmp[i*3 + 1];
411         comp.sfx = sampling_factors >> 4;
412         comp.sfy = sampling_factors & 0xf;
413         comp.qtable = tmp[i*3 + 2];
414         if ( comp.sfy < 1 || 4 < comp.sfy ||
415              comp.sfx < 1 || 4 < comp.sfx ||
416              3 < comp.qtable )
417             return ERROR.unsupp;
418 
419         if (dc.hmax < comp.sfx) dc.hmax = comp.sfx;
420         if (dc.vmax < comp.sfy) dc.vmax = comp.sfy;
421 
422         mcu_du += comp.sfx * comp.sfy;
423     }
424     if (10 < mcu_du)
425         return ERROR.unsupp;
426 
427     assert(dc.hmax * dc.vmax);
428     foreach (i; 0..dc.num_comps) {
429         dc.comps[i].x = cast(size_t) ceil(dc.width * (cast(double) dc.comps[i].sfx /
430                     dc.hmax));
431         dc.comps[i].y = cast(size_t) ceil(dc.height * (cast(double) dc.comps[i].sfy /
432                     dc.vmax));
433     }
434 
435     size_t mcu_w = dc.hmax * 8;
436     size_t mcu_h = dc.vmax * 8;
437     dc.num_mcu_x = cast(int) ((dc.width + mcu_w-1) / mcu_w);
438     dc.num_mcu_y = cast(int) ((dc.height + mcu_h-1) / mcu_h);
439     return 0;
440 }
441 
442 // SOS -- start of scan
443 ubyte read_scan_header(ref JPEGDecoder dc)
444 {
445     const ushort len = read_u16be(dc.rc);
446     const ubyte num_scan_comps = read_u8(dc.rc);
447 
448     if ( num_scan_comps != dc.num_comps ||
449          len != (6+num_scan_comps*2) )
450         return dc.rc.fail ? ERROR.stream : ERROR.unsupp;
451 
452     ubyte[16] buf;
453     ubyte e;
454     read_block(dc.rc, buf[0..len-3]);
455     if (dc.rc.fail) return ERROR.stream;
456 
457     foreach (i; 0..num_scan_comps) {
458         const uint ci = buf[i*2] - (dc.correct_comp_ids ? 1 : 0);
459         if (ci >= dc.num_comps)
460             return ERROR.data;
461 
462         const ubyte tables = buf[i*2 + 1];
463         dc.comps[ci].dc_table = tables >> 4;
464         dc.comps[ci].ac_table = tables & 0x0f;
465         if (dc.comps[ci].dc_table > 1 || dc.comps[ci].ac_table > 1)
466             return ERROR.unsupp;
467     }
468 
469     // ignore these
470     //ubyte spectral_start = buf[$-3];
471     //ubyte spectral_end = buf[$-2];
472     //ubyte approx = buf[$-1];
473     return 0;
474 }
475 
476 // E.2.3 and E.8 and E.9
477 ubyte decode_scan(ref JPEGDecoder dc)
478 {
479     int intervals, mcus;
480     if (dc.restart_interval > 0) {
481         int total_mcus = dc.num_mcu_x * dc.num_mcu_y;
482         intervals = (total_mcus + dc.restart_interval-1) / dc.restart_interval;
483         mcus = dc.restart_interval;
484     } else {
485         intervals = 1;
486         mcus = dc.num_mcu_x * dc.num_mcu_y;
487     }
488 
489     ubyte e;
490     foreach (mcu_j; 0 .. dc.num_mcu_y) {
491         foreach (mcu_i; 0 .. dc.num_mcu_x) {
492 
493             // decode mcu
494             foreach (c; 0..dc.num_comps) {
495                 auto comp = &dc.comps[c];
496                 foreach (du_j; 0 .. comp.sfy) {
497                     foreach (du_i; 0 .. comp.sfx) {
498                         // decode entropy, dequantize & dezigzag
499                         short[64] data;
500                         e = decode_block(dc, *comp, dc.qtables[comp.qtable], data);
501                         if (e) return e;
502                         // idct & level-shift
503                         int outx = (mcu_i * comp.sfx + du_i) * 8;
504                         int outy = (mcu_j * comp.sfy + du_j) * 8;
505                         int dst_stride = dc.num_mcu_x * comp.sfx*8;
506                         ubyte* dst = comp.data.ptr + outy*dst_stride + outx;
507                         stbi__idct_block(dst, dst_stride, data);
508                     }
509                 }
510             }
511 
512             --mcus;
513 
514             if (!mcus) {
515                 --intervals;
516                 if (!intervals)
517                     return e;
518 
519                 e = read_restart(dc.rc);    // RSTx marker
520 
521                 if (intervals == 1) {
522                     // last interval, may have fewer MCUs than defined by DRI
523                     mcus = (dc.num_mcu_y - mcu_j - 1)
524                          * dc.num_mcu_x + dc.num_mcu_x - mcu_i - 1;
525                 } else {
526                     mcus = dc.restart_interval;
527                 }
528 
529                 // reset decoder
530                 dc.cb = 0;
531                 dc.bits_left = 0;
532                 foreach (k; 0..dc.num_comps)
533                     dc.comps[k].pred = 0;
534             }
535 
536         }
537     }
538     return e;
539 }
540 
541 // RST0-RST7
542 ubyte read_restart(Reader* rc) {
543     ubyte a = read_u8(rc);
544     ubyte b = read_u8(rc);
545     if (rc.fail) return ERROR.stream;
546     if (a != 0xff || b < MARKER.RST0 || b > MARKER.RST7)
547         return ERROR.data;
548     return 0;
549     // the markers should cycle 0 through 7, could check that here...
550 }
551 
552 immutable ubyte[64] dezigzag = [
553      0,  1,  8, 16,  9,  2,  3, 10,
554     17, 24, 32, 25, 18, 11,  4,  5,
555     12, 19, 26, 33, 40, 48, 41, 34,
556     27, 20, 13,  6,  7, 14, 21, 28,
557     35, 42, 49, 56, 57, 50, 43, 36,
558     29, 22, 15, 23, 30, 37, 44, 51,
559     58, 59, 52, 45, 38, 31, 39, 46,
560     53, 60, 61, 54, 47, 55, 62, 63,
561 ];
562 
563 // decode entropy, dequantize & dezigzag (see section F.2)
564 ubyte decode_block(ref JPEGDecoder dc, ref Component comp, in ref ubyte[64] qtable,
565                                                               out short[64] result)
566 {
567     ubyte e;
568     const ubyte t = decode_huff(dc, dc.dc_tables[comp.dc_table], e);
569     if (e) return e;
570     const int diff = t ? dc.receive_and_extend(t, e) : 0;
571 
572     comp.pred = comp.pred + diff;
573     result[0] = cast(short) (comp.pred * qtable[0]);
574 
575     int k = 1;
576     do {
577         ubyte rs = decode_huff(dc, dc.ac_tables[comp.ac_table], e);
578         if (e) return e;
579         ubyte rrrr = rs >> 4;
580         ubyte ssss = rs & 0xf;
581 
582         if (ssss == 0) {
583             if (rrrr != 0xf)
584                 break;      // end of block
585             k += 16;    // run length is 16
586             continue;
587         }
588 
589         k += rrrr;
590 
591         if (63 < k)
592             return ERROR.data;
593         result[dezigzag[k]] = cast(short) (dc.receive_and_extend(ssss, e) * qtable[k]);
594         k += 1;
595     } while (k < 64);
596 
597     return 0;
598 }
599 
600 int receive_and_extend(ref JPEGDecoder dc, in ubyte s, ref ubyte e)
601 {
602     // receive
603     int symbol = 0;
604     foreach (_; 0..s)
605         symbol = (symbol << 1) + nextbit(dc, e);
606     // extend
607     int vt = 1 << (s-1);
608     if (symbol < vt)
609         return symbol + (-1 << s) + 1;
610     return symbol;
611 }
612 
613 // F.16 -- the DECODE
614 ubyte decode_huff(ref JPEGDecoder dc, in ref HuffTab tab, ref ubyte e)
615 {
616     short code = nextbit(dc, e);
617 
618     int i = 0;
619     while (tab.maxcode[i] < code) {
620         code = cast(short) ((code << 1) + nextbit(dc, e));
621         i += 1;
622         if (i >= tab.maxcode.length) {
623             e = ERROR.data;
624             return 0;
625         }
626     }
627     const uint j = cast(uint) (tab.valptr[i] + code - tab.mincode[i]);
628     if (j >= tab.values.length) {
629         e = ERROR.data;
630         return 0;
631     }
632     return tab.values[j];
633 }
634 
635 // F.2.2.5 and F.18
636 ubyte nextbit(ref JPEGDecoder dc, ref ubyte e)
637 {
638     if (!dc.bits_left) {
639         dc.cb = read_u8(dc.rc);
640         dc.bits_left = 8;
641 
642         if (dc.cb == 0xff) {
643             if (read_u8(dc.rc) != 0x0) {
644                 e = dc.rc.fail ? ERROR.stream : ERROR.data; // unexpected marker
645                 return 0;
646             }
647         }
648         if (dc.rc.fail) e = ERROR.stream;
649     }
650 
651     ubyte r = dc.cb >> 7;
652     dc.cb <<= 1;
653     dc.bits_left -= 1;
654     return r;
655 }
656 
657 ubyte[] reconstruct(in ref JPEGDecoder dc, ref ubyte e)
658 {
659     ubyte[] result = new_buffer(dc.width * dc.height * dc.tchans, e);
660     if (e) return null;
661 
662     switch (dc.num_comps * 10 + dc.tchans) {
663         case 34, 33:
664             // Use specialized bilinear filtering functions for the frequent cases where
665             // Cb & Cr channels have half resolution.
666             if (dc.comps[0].sfx <= 2 && dc.comps[0].sfy <= 2 &&
667                 dc.comps[0].sfx + dc.comps[0].sfy >= 3       &&
668                 dc.comps[1].sfx == 1 && dc.comps[1].sfy == 1 &&
669                 dc.comps[2].sfx == 1 && dc.comps[2].sfy == 1)
670             {
671                 void function(in ubyte[], in ubyte[], ubyte[]) nothrow resample;
672                 switch (dc.comps[0].sfx * 10 + dc.comps[0].sfy) {
673                     case 22: resample = &upsample_h2_v2; break;
674                     case 21: resample = &upsample_h2_v1; break;
675                     case 12: resample = &upsample_h1_v2; break;
676                     default: assert(0);
677                 }
678 
679                 ubyte[] comps1n2 = new_buffer(dc.width * 2, e);
680                 if (e) return null;
681                 scope(exit) _free(comps1n2.ptr);
682                 ubyte[] comp1 = comps1n2[0..dc.width];
683                 ubyte[] comp2 = comps1n2[dc.width..$];
684 
685                 size_t s = 0;
686                 size_t di = 0;
687                 foreach (j; 0 .. dc.height) {
688                     const size_t mi = j / dc.comps[0].sfy;
689                     const size_t si = (mi == 0 || mi >= (dc.height-1)/dc.comps[0].sfy)
690                               ? mi : mi - 1 + s * 2;
691                     s = s ^ 1;
692 
693                     const size_t cs = dc.num_mcu_x * dc.comps[1].sfx * 8;
694                     const size_t cl0 = mi * cs;
695                     const size_t cl1 = si * cs;
696                     resample(dc.comps[1].data[cl0 .. cl0 + dc.comps[1].x],
697                              dc.comps[1].data[cl1 .. cl1 + dc.comps[1].x],
698                              comp1[]);
699                     resample(dc.comps[2].data[cl0 .. cl0 + dc.comps[2].x],
700                              dc.comps[2].data[cl1 .. cl1 + dc.comps[2].x],
701                              comp2[]);
702 
703                     foreach (i; 0 .. dc.width) {
704                         result[di .. di+3] = ycbcr_to_rgb(
705                             dc.comps[0].data[j * dc.num_mcu_x * dc.comps[0].sfx * 8 + i],
706                             comp1[i],
707                             comp2[i],
708                         );
709                         if (dc.tchans == 4)
710                             result[di+3] = 255;
711                         di += dc.tchans;
712                     }
713                 }
714 
715                 return result;
716             }
717 
718             foreach (const ref comp; dc.comps[0..dc.num_comps]) {
719                 if (comp.sfx != dc.hmax || comp.sfy != dc.vmax)
720                     return dc.upsample(result);
721             }
722 
723             size_t si, di;
724             foreach (j; 0 .. dc.height) {
725                 foreach (i; 0 .. dc.width) {
726                     result[di .. di+3] = ycbcr_to_rgb(
727                         dc.comps[0].data[si+i],
728                         dc.comps[1].data[si+i],
729                         dc.comps[2].data[si+i],
730                     );
731                     if (dc.tchans == 4)
732                         result[di+3] = 255;
733                     di += dc.tchans;
734                 }
735                 si += dc.num_mcu_x * dc.comps[0].sfx * 8;
736             }
737             return result;
738         case 32, 12, 31, 11:
739             const comp = &dc.comps[0];
740             if (comp.sfx == dc.hmax && comp.sfy == dc.vmax) {
741                 size_t si, di;
742                 if (dc.tchans == 2) {
743                     foreach (j; 0 .. dc.height) {
744                         foreach (i; 0 .. dc.width) {
745                             result[di++] = comp.data[si+i];
746                             result[di++] = 255;
747                         }
748                         si += dc.num_mcu_x * comp.sfx * 8;
749                     }
750                 } else {
751                     foreach (j; 0 .. dc.height) {
752                         result[di .. di+dc.width] = comp.data[si .. si+dc.width];
753                         si += dc.num_mcu_x * comp.sfx * 8;
754                         di += dc.width;
755                     }
756                 }
757                 return result;
758             } else {
759                 // need to resample (haven't tested this...)
760                 return dc.upsample_luma(result);
761             }
762         case 14, 13:
763             const comp = &dc.comps[0];
764             size_t si, di;
765             foreach (j; 0 .. dc.height) {
766                 foreach (i; 0 .. dc.width) {
767                     result[di .. di+3] = comp.data[si+i];
768                     if (dc.tchans == 4)
769                         result[di+3] = 255;
770                     di += dc.tchans;
771                 }
772                 si += dc.num_mcu_x * comp.sfx * 8;
773             }
774             return result;
775         default:
776             assert(0);
777     }
778 }
779 
780 void upsample_h2_v2(in ubyte[] line0, in ubyte[] line1, ubyte[] result)
781 {
782     ubyte mix(ubyte mm, ubyte ms, ubyte sm, ubyte ss)
783     {
784         return cast(ubyte) (( cast(uint) mm * 3 * 3
785                             + cast(uint) ms * 3 * 1
786                             + cast(uint) sm * 1 * 3
787                             + cast(uint) ss * 1 * 1
788                             + 8) / 16);
789     }
790 
791     result[0] = cast(ubyte) (( cast(uint) line0[0] * 3
792                              + cast(uint) line1[0] * 1
793                              + 2) / 4);
794     if (line0.length == 1)
795         return;
796     result[1] = mix(line0[0], line0[1], line1[0], line1[1]);
797 
798     size_t di = 2;
799     foreach (i; 1 .. line0.length) {
800         result[di] = mix(line0[i], line0[i-1], line1[i], line1[i-1]);
801         di += 1;
802         if (i == line0.length-1) {
803             if (di < result.length) {
804                 result[di] = cast(ubyte) (( cast(uint) line0[i] * 3
805                                           + cast(uint) line1[i] * 1
806                                           + 2) / 4);
807             }
808             return;
809         }
810         result[di] = mix(line0[i], line0[i+1], line1[i], line1[i+1]);
811         di += 1;
812     }
813 }
814 
815 void upsample_h2_v1(in ubyte[] line0, in ubyte[] _line1, ubyte[] result)
816 {
817     result[0] = line0[0];
818     if (line0.length == 1)
819         return;
820     result[1] = cast(ubyte) (( cast(uint) line0[0] * 3
821                              + cast(uint) line0[1] * 1
822                              + 2) / 4);
823     size_t di = 2;
824     foreach (i; 1 .. line0.length) {
825         result[di] = cast(ubyte) (( cast(uint) line0[i-1] * 1
826                                   + cast(uint) line0[i+0] * 3
827                                   + 2) / 4);
828         di += 1;
829         if (i == line0.length-1) {
830             if (di < result.length) result[di] = line0[i];
831             return;
832         }
833         result[di] = cast(ubyte) (( cast(uint) line0[i+0] * 3
834                                   + cast(uint) line0[i+1] * 1
835                                   + 2) / 4);
836         di += 1;
837     }
838 }
839 
840 void upsample_h1_v2(in ubyte[] line0, in ubyte[] line1, ubyte[] result)
841 {
842     foreach (i; 0 .. result.length) {
843         result[i] = cast(ubyte) (( cast(uint) line0[i] * 3
844                                  + cast(uint) line1[i] * 1
845                                  + 2) / 4);
846     }
847 }
848 
849 // Nearest neighbor
850 ubyte[] upsample_luma(in ref JPEGDecoder dc, ubyte[] result)
851 {
852     const size_t stride0 = dc.num_mcu_x * dc.comps[0].sfx * 8;
853     const y_step0 = cast(float) dc.comps[0].sfy / cast(float) dc.vmax;
854     const x_step0 = cast(float) dc.comps[0].sfx / cast(float) dc.hmax;
855 
856     float y0 = y_step0 * 0.5;
857     size_t y0i = 0;
858 
859     size_t di;
860 
861     foreach (j; 0 .. dc.height) {
862         float x0 = x_step0 * 0.5;
863         size_t x0i = 0;
864         foreach (i; 0 .. dc.width) {
865             result[di] = dc.comps[0].data[y0i + x0i];
866             if (dc.tchans == 2)
867                 result[di+1] = 255;
868             di += dc.tchans;
869             x0 += x_step0;
870             if (x0 >= 1.0) {
871                 x0 -= 1.0;
872                 x0i += 1;
873             }
874         }
875         y0 += y_step0;
876         if (y0 >= 1.0) {
877             y0 -= 1.0;
878             y0i += stride0;
879         }
880     }
881     return result;
882 }
883 
884 // Nearest neighbor
885 ubyte[] upsample(in ref JPEGDecoder dc, ubyte[] result)
886 {
887     const size_t stride0 = dc.num_mcu_x * dc.comps[0].sfx * 8;
888     const size_t stride1 = dc.num_mcu_x * dc.comps[1].sfx * 8;
889     const size_t stride2 = dc.num_mcu_x * dc.comps[2].sfx * 8;
890 
891     const y_step0 = cast(float) dc.comps[0].sfy / cast(float) dc.vmax;
892     const y_step1 = cast(float) dc.comps[1].sfy / cast(float) dc.vmax;
893     const y_step2 = cast(float) dc.comps[2].sfy / cast(float) dc.vmax;
894     const x_step0 = cast(float) dc.comps[0].sfx / cast(float) dc.hmax;
895     const x_step1 = cast(float) dc.comps[1].sfx / cast(float) dc.hmax;
896     const x_step2 = cast(float) dc.comps[2].sfx / cast(float) dc.hmax;
897 
898     float y0 = y_step0 * 0.5;
899     float y1 = y_step1 * 0.5;
900     float y2 = y_step2 * 0.5;
901     size_t y0i = 0;
902     size_t y1i = 0;
903     size_t y2i = 0;
904 
905     size_t di;
906 
907     foreach (_j; 0 .. dc.height) {
908         float x0 = x_step0 * 0.5;
909         float x1 = x_step1 * 0.5;
910         float x2 = x_step2 * 0.5;
911         size_t x0i = 0;
912         size_t x1i = 0;
913         size_t x2i = 0;
914         foreach (i; 0 .. dc.width) {
915             result[di .. di+3] = ycbcr_to_rgb(
916                 dc.comps[0].data[y0i + x0i],
917                 dc.comps[1].data[y1i + x1i],
918                 dc.comps[2].data[y2i + x2i],
919             );
920             if (dc.tchans == 4)
921                 result[di+3] = 255;
922             di += dc.tchans;
923             x0 += x_step0;
924             x1 += x_step1;
925             x2 += x_step2;
926             if (x0 >= 1.0) { x0 -= 1.0; x0i += 1; }
927             if (x1 >= 1.0) { x1 -= 1.0; x1i += 1; }
928             if (x2 >= 1.0) { x2 -= 1.0; x2i += 1; }
929         }
930         y0 += y_step0;
931         y1 += y_step1;
932         y2 += y_step2;
933         if (y0 >= 1.0) { y0 -= 1.0; y0i += stride0; }
934         if (y1 >= 1.0) { y1 -= 1.0; y1i += stride1; }
935         if (y2 >= 1.0) { y2 -= 1.0; y2i += stride2; }
936     }
937     return result;
938 }
939 
940 ubyte[3] ycbcr_to_rgb(in ubyte y, in ubyte cb, in ubyte cr) pure {
941     ubyte[3] rgb = void;
942     rgb[0] = clamp(y + 1.402*(cr-128));
943     rgb[1] = clamp(y - 0.34414*(cb-128) - 0.71414*(cr-128));
944     rgb[2] = clamp(y + 1.772*(cb-128));
945     return rgb;
946 }
947 
948 ubyte clamp(in float x) pure {
949     if (x < 0) return 0;
950     if (255 < x) return 255;
951     return cast(ubyte) x;
952 }
953 
954 // ------------------------------------------------------------
955 // The IDCT stuff here (to the next dashed line) is copied and adapted from
956 // stb_image which is released under public domain.  Many thanks to stb_image
957 // author, Sean Barrett.
958 // Link: https://github.com/nothings/stb/blob/master/stb_image.h
959 
960 pure int f2f(float x) { return cast(int) (x * 4096 + 0.5); }
961 pure int fsh(int x) { return x << 12; }
962 
963 // from stb_image, derived from jidctint -- DCT_ISLOW
964 pure void STBI__IDCT_1D(ref int t0, ref int t1, ref int t2, ref int t3,
965                         ref int x0, ref int x1, ref int x2, ref int x3,
966         int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7)
967 {
968    int p1,p2,p3,p4,p5;
969    //int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3;
970    p2 = s2;
971    p3 = s6;
972    p1 = (p2+p3) * f2f(0.5411961f);
973    t2 = p1 + p3 * f2f(-1.847759065f);
974    t3 = p1 + p2 * f2f( 0.765366865f);
975    p2 = s0;
976    p3 = s4;
977    t0 = fsh(p2+p3);
978    t1 = fsh(p2-p3);
979    x0 = t0+t3;
980    x3 = t0-t3;
981    x1 = t1+t2;
982    x2 = t1-t2;
983    t0 = s7;
984    t1 = s5;
985    t2 = s3;
986    t3 = s1;
987    p3 = t0+t2;
988    p4 = t1+t3;
989    p1 = t0+t3;
990    p2 = t1+t2;
991    p5 = (p3+p4)*f2f( 1.175875602f);
992    t0 = t0*f2f( 0.298631336f);
993    t1 = t1*f2f( 2.053119869f);
994    t2 = t2*f2f( 3.072711026f);
995    t3 = t3*f2f( 1.501321110f);
996    p1 = p5 + p1*f2f(-0.899976223f);
997    p2 = p5 + p2*f2f(-2.562915447f);
998    p3 = p3*f2f(-1.961570560f);
999    p4 = p4*f2f(-0.390180644f);
1000    t3 += p1+p4;
1001    t2 += p2+p3;
1002    t1 += p2+p4;
1003    t0 += p1+p3;
1004 }
1005 
1006 // idct and level-shift
1007 pure void stbi__idct_block(ubyte* dst, in int dst_stride, in ref short[64] data)
1008 {
1009    int i;
1010    int[64] val;
1011    int* v = val.ptr;
1012    const(short)* d = data.ptr;
1013 
1014    // columns
1015    for (i=0; i < 8; ++i,++d, ++v) {
1016       // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
1017       if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
1018            && d[40]==0 && d[48]==0 && d[56]==0) {
1019          //    no shortcut                 0     seconds
1020          //    (1|2|3|4|5|6|7)==0          0     seconds
1021          //    all separate               -0.047 seconds
1022          //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
1023          int dcterm = d[0] << 2;
1024          v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
1025       } else {
1026          int t0,t1,t2,t3,x0,x1,x2,x3;
1027          STBI__IDCT_1D(
1028              t0, t1, t2, t3,
1029              x0, x1, x2, x3,
1030              d[ 0], d[ 8], d[16], d[24],
1031              d[32], d[40], d[48], d[56]
1032          );
1033          // constants scaled things up by 1<<12; let's bring them back
1034          // down, but keep 2 extra bits of precision
1035          x0 += 512; x1 += 512; x2 += 512; x3 += 512;
1036          v[ 0] = (x0+t3) >> 10;
1037          v[56] = (x0-t3) >> 10;
1038          v[ 8] = (x1+t2) >> 10;
1039          v[48] = (x1-t2) >> 10;
1040          v[16] = (x2+t1) >> 10;
1041          v[40] = (x2-t1) >> 10;
1042          v[24] = (x3+t0) >> 10;
1043          v[32] = (x3-t0) >> 10;
1044       }
1045    }
1046 
1047    ubyte* o = dst;
1048    for (i=0, v=val.ptr; i < 8; ++i,v+=8,o+=dst_stride) {
1049       // no fast case since the first 1D IDCT spread components out
1050       int t0,t1,t2,t3,x0,x1,x2,x3;
1051       STBI__IDCT_1D(
1052           t0, t1, t2, t3,
1053           x0, x1, x2, x3,
1054           v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7]
1055       );
1056       // constants scaled things up by 1<<12, plus we had 1<<2 from first
1057       // loop, plus horizontal and vertical each scale by sqrt(8) so together
1058       // we've got an extra 1<<3, so 1<<17 total we need to remove.
1059       // so we want to round that, which means adding 0.5 * 1<<17,
1060       // aka 65536. Also, we'll end up with -128 to 127 that we want
1061       // to encode as 0-255 by adding 128, so we'll add that before the shift
1062       x0 += 65536 + (128<<17);
1063       x1 += 65536 + (128<<17);
1064       x2 += 65536 + (128<<17);
1065       x3 += 65536 + (128<<17);
1066       // tried computing the shifts into temps, or'ing the temps to see
1067       // if any were out of range, but that was slower
1068       o[0] = stbi__clamp((x0+t3) >> 17);
1069       o[7] = stbi__clamp((x0-t3) >> 17);
1070       o[1] = stbi__clamp((x1+t2) >> 17);
1071       o[6] = stbi__clamp((x1-t2) >> 17);
1072       o[2] = stbi__clamp((x2+t1) >> 17);
1073       o[5] = stbi__clamp((x2-t1) >> 17);
1074       o[3] = stbi__clamp((x3+t0) >> 17);
1075       o[4] = stbi__clamp((x3-t0) >> 17);
1076    }
1077 }
1078 
1079 // clamp to 0-255
1080 pure ubyte stbi__clamp(int x) {
1081    if (cast(uint) x > 255) {
1082       if (x < 0) return 0;
1083       if (x > 255) return 255;
1084    }
1085    return cast(ubyte) x;
1086 }
1087 
1088 // ------------------------------------------------------------