row = wasm_i64x2_const(0, 0);
w0 = wasm_i32x4_shuffle(inf_u_q, inf_u_q, N, N, N, N);
flags = wasm_v128_and(w0, wasm_i32x4_const(0x1110, 0x2220, 0x4440, 0x8880));
insig = wasm_i32x4_eq(flags, wasm_i64x2_const(0, 0));
if (wasm_i8x16_bitmask(insig) != 0xFFFF)
U_q = wasm_i32x4_shuffle(U_q, U_q, N, N, N, N);
flags = wasm_i16x8_mul(flags, wasm_i16x8_const(8, 8, 4, 4, 2, 2, 1, 1));
v128_t ms_vec = frwd_fetch<0xFF>(magsgn);
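// After the mask and multiply above, each 32-bit lane carries its sample's
// flags with e_k in bit 15 and e_1 in bit 11 (they are consumed below via
// the shift-by-15 and the 0x800 mask); U_q holds the quad's U_q replicated
// across all lanes, and ms_vec holds the next MagSgn bytes.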
w0 = wasm_u32x4_shr(flags, 15);           // e_k
v128_t m_n = wasm_i32x4_sub(U_q, w0);     // m_n = U_q - e_k
m_n = wasm_v128_andnot(m_n, insig);       // zero m_n for insignificant samples
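// Inclusive and exclusive prefix sums of m_n follow: ex_sum gives, for each
// sample, the bit offset at which its magnitude-sign field starts within
// ms_vec, and total_mn is the number of MagSgn bits the whole quad consumes.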
v128_t ex_sum, shfl, inc_sum = m_n;
shfl = wasm_i32x4_shuffle(wasm_i64x2_const(0, 0), inc_sum, 3, 4, 5, 6);
inc_sum = wasm_i32x4_add(inc_sum, shfl);
shfl = wasm_i64x2_shuffle(wasm_i64x2_const(0, 0), inc_sum, 1, 2);
inc_sum = wasm_i32x4_add(inc_sum, shfl);
int total_mn = wasm_u16x8_extract_lane(inc_sum, 6);
ex_sum = wasm_i32x4_shuffle(wasm_i64x2_const(0, 0), inc_sum, 3, 4, 5, 6);
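// Split each bit offset into a byte index (ex_sum >> 3) and a residual bit
// index (ex_sum & 7).  The swizzles below gather, per sample, the bytes of
// ms_vec starting at that byte index (d0) and the bytes one position later (d1).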
v128_t byte_idx = wasm_u32x4_shr(ex_sum, 3);
v128_t bit_idx = wasm_v128_and(ex_sum, wasm_i32x4_const(OJPH_REPEAT4(7)));
byte_idx = wasm_i8x16_swizzle(byte_idx,
  wasm_i32x4_const(0x00000000, 0x04040404, 0x08080808, 0x0C0C0C0C));
byte_idx = wasm_i32x4_add(byte_idx, wasm_i32x4_const(OJPH_REPEAT4(0x03020100)));
v128_t d0 = wasm_i8x16_swizzle(ms_vec, byte_idx);
byte_idx = wasm_i32x4_add(byte_idx, wasm_i32x4_const(OJPH_REPEAT4(0x01010101)));
v128_t d1 = wasm_i8x16_swizzle(ms_vec, byte_idx);
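// Align each gathered field to bit 0 of its lane: bit_shift is built so that
// each 16-bit lane holds 256 >> bit_idx, and the multiply followed by the
// shift-right-by-8 below acts as a per-lane right shift by bit_idx; d1
// supplies the high bytes that the 16-bit multiply of d0 cannot produce.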
bit_idx = wasm_v128_or(bit_idx, wasm_i32x4_shl(bit_idx, 16));
v128_t bit_shift = wasm_i8x16_swizzle(
  wasm_i8x16_const(-1, 127, 63, 31, 15, 7, 3, 1,
                   -1, 127, 63, 31, 15, 7, 3, 1), bit_idx);
bit_shift = wasm_i16x8_add(bit_shift, wasm_i16x8_const(OJPH_REPEAT8(0x0101)));
d0 = wasm_i16x8_mul(d0, bit_shift);
d0 = wasm_u16x8_shr(d0, 8);
d1 = wasm_i16x8_mul(d1, bit_shift);
d1 = wasm_v128_and(d1, wasm_u32x4_const(OJPH_REPEAT4(0xFF00FF00)));
d0 = wasm_v128_or(d0, d1);
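// Trim each field to its m_n valid bits: shift = (2 - e_k) << (U_q - 1), so
// shift - 1 masks exactly U_q - e_k = m_n bits.  Where the e_1 flag is set,
// the shift value itself is OR-ed back in as the implicit MSB.  Bit 0 of the
// result is the sign; the rest is the magnitude, which is then assembled
// into the output sample (sign in bit 31, magnitude scaled by 2^(p-1)) for
// significant samples only.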
ui32 U_q_m1 = wasm_u32x4_extract_lane(U_q, 0) - 1u;
w0 = wasm_i32x4_sub(twos, w0);
v128_t shift = wasm_i32x4_shl(w0, U_q_m1);
ms_vec = wasm_v128_and(d0, wasm_i32x4_sub(shift, ones));
w0 = wasm_v128_and(flags, wasm_i32x4_const(OJPH_REPEAT4(0x800)));   // e_1
w0 = wasm_i32x4_eq(w0, wasm_i64x2_const(0, 0));                     // e_1 == 0
w0 = wasm_v128_andnot(shift, w0);        // shift where e_1 is set
ms_vec = wasm_v128_or(ms_vec, w0);       // add the implicit MSB
w0 = wasm_i32x4_shl(ms_vec, 31);         // sign bit (taken from bit 0)
ms_vec = wasm_v128_or(ms_vec, ones);
v128_t tvn = ms_vec;                     // keep for the v_n (exponent) update
ms_vec = wasm_i32x4_add(ms_vec, twos);
ms_vec = wasm_i32x4_shl(ms_vec, p - 1);
ms_vec = wasm_v128_or(ms_vec, w0);       // attach the sign
row = wasm_v128_andnot(ms_vec, insig);   // keep significant samples only
ms_vec = wasm_v128_andnot(tvn, insig);
// gather the bottom-row values (v_n) of this quad; the template parameter N
// selects which lanes of vn they occupy
if (N == 0)
  tvn = wasm_i8x16_swizzle(ms_vec,
    wasm_i32x4_const(0x07060504, 0x0F0E0D0C, -1, -1));
else
  tvn = wasm_i8x16_swizzle(ms_vec,
    wasm_i32x4_const(-1, 0x07060504, 0x0F0E0D0C, -1));
vn = wasm_v128_or(vn, tvn);
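// The 16-bit variant below mirrors decode_one_quad32 step for step, but
// packs a pair of quads (eight samples) into one vector of 16-bit lanes,
// which is only viable when a sample's bit budget fits in 16 bits.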
row = wasm_i64x2_const(0, 0);
w0 = wasm_i8x16_swizzle(inf_u_q,
  wasm_i16x8_const(0x0100, 0x0100, 0x0100, 0x0100,
                   0x0504, 0x0504, 0x0504, 0x0504));
flags = wasm_v128_and(w0,
  wasm_u16x8_const(0x1110, 0x2220, 0x4440, 0x8880,
                   0x1110, 0x2220, 0x4440, 0x8880));
insig = wasm_i16x8_eq(flags, wasm_i64x2_const(0, 0));
if (wasm_i8x16_bitmask(insig) != 0xFFFF)
U_q = wasm_i8x16_swizzle(U_q,
  wasm_i16x8_const(0x0100, 0x0100, 0x0100, 0x0100,
                   0x0504, 0x0504, 0x0504, 0x0504));
flags = wasm_i16x8_mul(flags, wasm_i16x8_const(8, 4, 2, 1, 8, 4, 2, 1));
v128_t ms_vec = frwd_fetch<0xFF>(magsgn);
w0 = wasm_u16x8_shr(flags, 15);           // e_k
v128_t m_n = wasm_i16x8_sub(U_q, w0);     // m_n = U_q - e_k
m_n = wasm_v128_andnot(m_n, insig);
v128_t ex_sum, shfl, inc_sum = m_n;
shfl = wasm_i16x8_shuffle(wasm_i64x2_const(0, 0),
  inc_sum, 7, 8, 9, 10, 11, 12, 13, 14);
inc_sum = wasm_i16x8_add(inc_sum, shfl);
shfl = wasm_i32x4_shuffle(wasm_i64x2_const(0, 0), inc_sum, 3, 4, 5, 6);
inc_sum = wasm_i16x8_add(inc_sum, shfl);
shfl = wasm_i64x2_shuffle(wasm_i64x2_const(0, 0), inc_sum, 1, 2);
inc_sum = wasm_i16x8_add(inc_sum, shfl);
int total_mn = wasm_u16x8_extract_lane(inc_sum, 7);
ex_sum = wasm_i16x8_shuffle(wasm_i64x2_const(0, 0),
  inc_sum, 7, 8, 9, 10, 11, 12, 13, 14);
v128_t byte_idx = wasm_u16x8_shr(ex_sum, 3);
v128_t bit_idx = wasm_v128_and(ex_sum, wasm_i16x8_const(OJPH_REPEAT8(7)));
byte_idx = wasm_i8x16_swizzle(byte_idx,
  wasm_i16x8_const(0x0000, 0x0202, 0x0404, 0x0606,
                   0x0808, 0x0A0A, 0x0C0C, 0x0E0E));
byte_idx = wasm_i16x8_add(byte_idx, wasm_i16x8_const(OJPH_REPEAT8(0x0100)));
v128_t d0 = wasm_i8x16_swizzle(ms_vec, byte_idx);
byte_idx = wasm_i16x8_add(byte_idx, wasm_i16x8_const(OJPH_REPEAT8(0x0101)));
v128_t d1 = wasm_i8x16_swizzle(ms_vec, byte_idx);
v128_t bit_shift = wasm_i8x16_swizzle(
  wasm_i8x16_const(-1, 127, 63, 31, 15, 7, 3, 1,
                   -1, 127, 63, 31, 15, 7, 3, 1), bit_idx);
bit_shift = wasm_i16x8_add(bit_shift, wasm_i16x8_const(OJPH_REPEAT8(0x0101)));
d0 = wasm_i16x8_mul(d0, bit_shift);
d0 = wasm_u16x8_shr(d0, 8);
d1 = wasm_i16x8_mul(d1, bit_shift);
d1 = wasm_v128_and(d1, wasm_u16x8_const(OJPH_REPEAT8(0xFF00)));  // keep only the high byte of d1
d0 = wasm_v128_or(d0, d1);
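// Unlike the 32-bit path, the two quads in this vector can have different
// U_q values, so the 2^m_n - 1 mask is built separately for each half:
// t0/t1 isolate one quad each, are shifted by their own U_q - 1, and are
// then recombined into a single shift vector.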
v128_t shift, t0, t1;
v128_t U_q_m1 = wasm_i32x4_sub(U_q, ones);
ui32 Uq0 = wasm_u16x8_extract_lane(U_q_m1, 0);
ui32 Uq1 = wasm_u16x8_extract_lane(U_q_m1, 4);
w0 = wasm_i16x8_sub(twos, w0);
t0 = wasm_v128_and(w0, wasm_i64x2_const(-1, 0));
t1 = wasm_v128_and(w0, wasm_i64x2_const(0, -1));
t0 = wasm_i32x4_shl(t0, Uq0);
t1 = wasm_i32x4_shl(t1, Uq1);
shift = wasm_v128_or(t0, t1);
ms_vec = wasm_v128_and(d0, wasm_i16x8_sub(shift, ones));
w0 = wasm_v128_and(flags, wasm_i16x8_const(OJPH_REPEAT8(0x800)));   // e_1
w0 = wasm_i16x8_eq(w0, wasm_i64x2_const(0, 0));
w0 = wasm_v128_andnot(shift, w0);
ms_vec = wasm_v128_or(ms_vec, w0);
w0 = wasm_i16x8_shl(ms_vec, 15);          // sign bit
ms_vec = wasm_v128_or(ms_vec, ones);
v128_t tvn = ms_vec;
ms_vec = wasm_i16x8_add(ms_vec, twos);
ms_vec = wasm_i16x8_shl(ms_vec, p - 1);
ms_vec = wasm_v128_or(ms_vec, w0);
row = wasm_v128_andnot(ms_vec, insig);
ms_vec = wasm_v128_andnot(tvn, insig);
w0 = wasm_i8x16_swizzle(ms_vec,
  wasm_i16x8_const(0x0302, 0x0706, -1, -1, -1, -1, -1, -1));
vn = wasm_v128_or(vn, w0);
w0 = wasm_i8x16_swizzle(ms_vec,
  wasm_i16x8_const(-1, 0x0B0A, 0x0F0E, -1, -1, -1, -1, -1));
vn = wasm_v128_or(vn, w0);
ui32 missing_msbs,
ui32 num_passes,
static bool insufficient_precision = false;
static bool modify_code = false;
static bool truncate_spp_mrp = false;
if (num_passes > 1 && lengths2 == 0)
OJPH_WARN(0x00010001, "Malformed codeblock: more than one coding pass, "
  "but zero length for the 2nd and potential 3rd passes.\n");
OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
  "this codeblock has %d passes.\n", num_passes);
if (missing_msbs > 30)
if (insufficient_precision == false)
insufficient_precision = true;
OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
  "codeblock. This message will not be displayed again.\n");
else if (missing_msbs == 30)
if (modify_code == false) {
modify_code = true;
OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
  "pass. The code can be modified to support this case. "
  "This message will not be displayed again.\n");
else if (missing_msbs == 29)
if (num_passes > 1) {
if (truncate_spp_mrp == false) {
truncate_spp_mrp = true;
OJPH_WARN(0x00010005, "Not enough precision to decode the SigProp "
  "or MagRef passes; both will be skipped. "
  "This message will not be displayed again.\n");
ui32 p = 30 - missing_msbs;
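// Lengths within the codeblock: lcup (= lengths1) is the size of the
// cleanup-pass segment; its last two bytes encode scup, the combined size of
// the MEL and VLC parts that sit at the end of that segment, while the
// MagSgn bits occupy the remaining lcup - scup bytes at the front.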
OJPH_WARN(0x00010006, "Wrong codeblock length.\n");
int lcup = (int)lengths1;   // length of the cleanup-pass segment
int scup = (((int)coded_data[lcup - 1]) << 4) + (coded_data[lcup - 2] & 0xF);
if (scup < 2 || scup > lcup || scup > 4079)   // invalid scup
ui16 scratch[8 * 513] = {0};
ui32 sstr = ((width + 2u) + 7u) & ~7u;   // scratch stride, a multiple of 8
assert((stride & 0x3) == 0);
ui32 mmsbp2 = missing_msbs + 2;
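// Three bit readers share the cleanup segment: the MagSgn reader runs
// forward from the start, the MEL decoder runs forward inside the final
// scup bytes, and the VLC reader consumes bits backward from the end.
// The VLC results go to scratch: for every quad a 16-bit flags word
// followed by a 16-bit u_q, with quad rows sstr entries apart; these pairs
// are later reloaded as 32-bit lanes (flags in the low half, u_q high).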
mel_init(&mel, coded_data, lcup, scup);
rev_init(&vlc, coded_data, lcup, scup);
for (ui32 x = 0; x < width; sp += 4)
t0 = (run == -1) ? t0 : 0;
c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
if (c_q == 0 && x < width)
t1 = (run == -1) ? t1 : 0;
t1 = x < width ? t1 : 0;
c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
if (uvlc_mode == 0xc0)
uvlc_mode += (run == -1) ? 0x40 : 0;
ui32 len = uvlc_entry & 0xF;
ui32 tmp = vlc_val & ((1 << len) - 1);
len = uvlc_entry & 0x7;
ui16 u_q = (ui16)(1 + (uvlc_entry & 7) + (tmp & ~(0xFFU << len)));  // u_q of the first quad
u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len));                 // u_q of the second quad
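// Each iteration of the loop above decodes a pair of quads of the first quad
// row: the VLC tables yield the significance pattern and EMB bits for each
// quad (t0, t1), c_q carries the context from one quad to the next, and the
// shared UVLC entry is split into the two u_q values.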
for (ui32 y = 2; y < height; y += 2)
ui16 *sp = scratch + (y >> 1) * sstr;
for (ui32 x = 0; x < width; sp += 4)
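// For quad rows after the first, the VLC context c_q also folds in
// significance bits from the quad row above (the sp[... - sstr] reads below).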
c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
t0 = (run == -1) ? t0 : 0;
c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
c_q |= sp[0 - (si32)sstr] & 0x80;
c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
t1 = vlc_tbl1[c_q + (vlc_val & 0x7F)];
if (c_q == 0 && x < width)
t1 = (run == -1) ? t1 : 0;
t1 = x < width ? t1 : 0;
c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
c_q |= sp[2 - (si32)sstr] & 0x80;
ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
ui32 len = uvlc_entry & 0xF;
ui32 tmp = vlc_val & ((1 << len) - 1);
len = uvlc_entry & 0x7;
ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len)));   // u_q of the first quad
u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len));                 // u_q of the second quad
const int v_n_size = 512 + 8;
ui32 v_n_scratch[2 * v_n_size] = {0};
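// v_n_scratch accumulates the bottom-row sample values (v_n) of each decoded
// quad; the second half of the buffer (offset v_n_size) later receives the
// exponents derived from them, which drive E_max / kappa for the quad row
// below.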
frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
ui32 *vp = v_n_scratch;
ui32 *dp = decoded_data;
for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
v128_t inf_u_q, U_q;
inf_u_q = wasm_v128_load(sp);
U_q = wasm_u32x4_shr(inf_u_q, 16);
w0 = wasm_i32x4_gt(U_q, wasm_u32x4_splat(mmsbp2));
ui32 i = wasm_i8x16_bitmask(w0);   // nonzero if any U_q exceeds mmsbp2 (malformed stream)
v128_t row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
v128_t row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
w0 = wasm_v128_load(vp);
w0 = wasm_v128_and(w0, wasm_i32x4_const(-1, 0, 0, 0));
w0 = wasm_v128_or(w0, vn);
wasm_v128_store(vp, w0);
// interleave the two quads into the two output image rows
w0 = wasm_i32x4_shuffle(row0, row1, 0, 4, 1, 5);
w1 = wasm_i32x4_shuffle(row0, row1, 2, 6, 3, 7);
row0 = wasm_i32x4_shuffle(w0, w1, 0, 4, 1, 5);
row1 = wasm_i32x4_shuffle(w0, w1, 2, 6, 3, 7);
wasm_v128_store(dp, row0);
wasm_v128_store(dp + stride, row1);
for (ui32 y = 2; y < height; y += 2)
ui32 *vp = v_n_scratch;
const v128_t lut_lo = wasm_i8x16_const(
  31, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4);
const v128_t lut_hi = wasm_i8x16_const(
  31, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
const v128_t nibble_mask = wasm_i8x16_const(OJPH_REPEAT16(0x0F));
const v128_t byte_offset8 = wasm_i16x8_const(OJPH_REPEAT8(8));
const v128_t byte_offset16 = wasm_i16x8_const(OJPH_REPEAT8(16));
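// The loop below finds, for each accumulated v_n value, the position of its
// most significant set bit: the nibble LUTs give a leading-zero count, the
// min cascade extends it across wider and wider element sizes, and the final
// subtraction from cc converts the count into the exponent that is stored in
// the second half of v_n_scratch.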
for (ui32 x = 0; x <= width; x += 8, vp += 4)
v = wasm_v128_load(vp);
t = wasm_v128_and(nibble_mask, v);
v = wasm_v128_and(wasm_u16x8_shr(v, 4), nibble_mask);
t = wasm_i8x16_swizzle(lut_lo, t);
v = wasm_i8x16_swizzle(lut_hi, v);
v = wasm_u8x16_min(v, t);
t = wasm_u16x8_shr(v, 8);
v = wasm_v128_or(v, byte_offset8);
v = wasm_u8x16_min(v, t);
t = wasm_u32x4_shr(v, 16);
v = wasm_v128_or(v, byte_offset16);
v = wasm_u8x16_min(v, t);
v = wasm_i16x8_sub(cc, v);
wasm_v128_store(vp + v_n_size, v);
ui32 *vp = v_n_scratch;
ui16 *sp = scratch + (y >> 1) * sstr;
ui32 *dp = decoded_data + y * stride;
for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
v128_t inf_u_q, U_q;
v128_t gamma, emax, kappa, u_q;
inf_u_q = wasm_v128_load(sp);
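// kappa for the cleanup context: gamma ends up as an all-ones mask for quads
// with at most one significant sample (rho & (rho - 1) == 0); for such quads
// the E_max contribution gathered from the quad row above is suppressed and
// kappa keeps its initial value.  The quad's bit budget is U_q = u_q + kappa.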
gamma = wasm_v128_and(inf_u_q, wasm_i32x4_const(OJPH_REPEAT4(0xF0)));
w0 = wasm_i32x4_sub(gamma, wasm_i32x4_const(OJPH_REPEAT4(1)));
gamma = wasm_v128_and(gamma, w0);
gamma = wasm_i32x4_eq(gamma, wasm_i64x2_const(0, 0));
emax = wasm_v128_load(vp + v_n_size);
w0 = wasm_i32x4_shuffle(emax, wasm_i64x2_const(0, 0), 1, 2, 3, 4);
emax = wasm_i16x8_max(w0, emax);        // combine neighbouring columns
emax = wasm_v128_andnot(emax, gamma);
kappa = wasm_i16x8_max(emax, kappa);
u_q = wasm_u32x4_shr(inf_u_q, 16);
U_q = wasm_i32x4_add(u_q, kappa);
w0 = wasm_i32x4_gt(U_q, wasm_u32x4_splat(mmsbp2));
ui32 i = wasm_i8x16_bitmask(w0);
v128_t row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
v128_t row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
w0 = wasm_v128_load(vp);
w0 = wasm_v128_and(w0, wasm_i32x4_const(-1, 0, 0, 0));
w0 = wasm_v128_or(w0, vn);
wasm_v128_store(vp, w0);
w0 = wasm_i32x4_shuffle(row0, row1, 0, 4, 1, 5);
w1 = wasm_i32x4_shuffle(row0, row1, 2, 6, 3, 7);
row0 = wasm_i32x4_shuffle(w0, w1, 0, 4, 1, 5);
row1 = wasm_i32x4_shuffle(w0, w1, 2, 6, 3, 7);
wasm_v128_store(dp, row0);
wasm_v128_store(dp + stride, row1);
const int v_n_size = 512 + 8;
ui16 v_n_scratch[2 * v_n_size] = {0};
frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
ui16 *vp = v_n_scratch;
ui32 *dp = decoded_data;
for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
v128_t inf_u_q, U_q;
inf_u_q = wasm_v128_load(sp);
U_q = wasm_u32x4_shr(inf_u_q, 16);
w0 = wasm_i32x4_gt(U_q, wasm_u32x4_splat(mmsbp2));
ui32 i = wasm_i8x16_bitmask(w0);
w0 = wasm_v128_load(vp);
w0 = wasm_v128_and(w0, wasm_i16x8_const(-1, 0, 0, 0, 0, 0, 0, 0));
w0 = wasm_v128_or(w0, vn);
wasm_v128_store(vp, w0);
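// In the 16-bit path, row holds eight 16-bit samples for the quad pair.
// The swizzles below expand each one into the upper half of a 32-bit output
// word (so the sign ends up in bit 31) and write the even samples to the
// current image row and the odd samples to the row below.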
w0 = wasm_i8x16_swizzle(row,
  wasm_i16x8_const(-1, 0x0100, -1, 0x0504,
                   -1, 0x0908, -1, 0x0D0C));
wasm_v128_store(dp, w0);
w1 = wasm_i8x16_swizzle(row,
  wasm_i16x8_const(-1, 0x0302, -1, 0x0706,
                   -1, 0x0B0A, -1, 0x0F0E));
wasm_v128_store(dp + stride, w1);
for (ui32 y = 2; y < height; y += 2)
ui16 *vp = v_n_scratch;
const v128_t lut_lo = wasm_i8x16_const(
  15, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4);
const v128_t lut_hi = wasm_i8x16_const(
  15, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
const v128_t nibble_mask = wasm_i8x16_const(OJPH_REPEAT16(0x0F));
const v128_t byte_offset8 = wasm_i16x8_const(OJPH_REPEAT8(8));
for (ui32 x = 0; x <= width; x += 16, vp += 8)
v = wasm_v128_load(vp);
t = wasm_v128_and(nibble_mask, v);
v = wasm_v128_and(wasm_u16x8_shr(v, 4), nibble_mask);
t = wasm_i8x16_swizzle(lut_lo, t);
v = wasm_i8x16_swizzle(lut_hi, v);
v = wasm_u8x16_min(v, t);
t = wasm_u16x8_shr(v, 8);
v = wasm_v128_or(v, byte_offset8);
v = wasm_u8x16_min(v, t);
v = wasm_i16x8_sub(cc, v);
wasm_v128_store(vp + v_n_size, v);
ui16 *vp = v_n_scratch;
ui16 *sp = scratch + (y >> 1) * sstr;
ui32 *dp = decoded_data + y * stride;
for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
v128_t inf_u_q, U_q;
v128_t gamma, emax, kappa, u_q;
inf_u_q = wasm_v128_load(sp);
gamma = wasm_v128_and(inf_u_q, wasm_i32x4_const(OJPH_REPEAT4(0xF0)));
w0 = wasm_i32x4_sub(gamma, wasm_i32x4_const(OJPH_REPEAT4(1)));
gamma = wasm_v128_and(gamma, w0);
gamma = wasm_i32x4_eq(gamma, wasm_i64x2_const(0, 0));
emax = wasm_v128_load(vp + v_n_size);
w0 = wasm_i16x8_shuffle(emax,
  wasm_i64x2_const(0, 0), 1, 2, 3, 4, 5, 6, 7, 8);
emax = wasm_i16x8_max(w0, emax);
emax = wasm_i8x16_swizzle(emax,
  wasm_i16x8_const(0x0100, -1, 0x0302, -1,
                   0x0504, -1, 0x0706, -1));
emax = wasm_v128_andnot(emax, gamma);
kappa = wasm_i16x8_max(emax, kappa);
u_q = wasm_u32x4_shr(inf_u_q, 16);
U_q = wasm_i32x4_add(u_q, kappa);
w0 = wasm_i32x4_gt(U_q, wasm_u32x4_splat(mmsbp2));
ui32 i = wasm_i8x16_bitmask(w0);
w0 = wasm_v128_load(vp);
w0 = wasm_v128_and(w0, wasm_i16x8_const(-1, 0, 0, 0, 0, 0, 0, 0));
w0 = wasm_v128_or(w0, vn);
wasm_v128_store(vp, w0);
w0 = wasm_i8x16_swizzle(row,
  wasm_i16x8_const(-1, 0x0100, -1, 0x0504,
                   -1, 0x0908, -1, 0x0D0C));
wasm_v128_store(dp, w0);
w1 = wasm_i8x16_swizzle(row,
  wasm_i16x8_const(-1, 0x0302, -1, 0x0706,
                   -1, 0x0B0A, -1, 0x0F0E));
wasm_v128_store(dp + stride, w1);
ui16* const sigma = scratch;
ui32 mstr = (width + 3u) >> 2;          // one 16-bit sigma entry covers four columns
mstr = ((mstr + 2u) + 7u) & ~7u;        // sigma stride, rounded to a multiple of 8
const v128_t mask_3 = wasm_i32x4_const(OJPH_REPEAT4(0x30));
const v128_t mask_C = wasm_i32x4_const(OJPH_REPEAT4(0xC0));
const v128_t shuffle_mask = wasm_i32x4_const(0x0C080400, -1, -1, -1);
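// Build the sigma map consumed by the SigProp and MagRef passes: the rho
// bits of two scratch rows (four sample rows) are packed so that every hex
// digit of a 16-bit sigma entry describes one column of four samples; each
// sigma row covers a stripe of four image rows and is mstr entries wide.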
for (y = 0; y < height; y += 4)
ui16* sp = scratch + (y >> 1) * sstr;
ui16* dp = sigma + (y >> 2) * mstr;
for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
v128_t s0, s1, u3, uC, t0, t1;
s0 = wasm_v128_load(sp);
u3 = wasm_v128_and(s0, mask_3);
u3 = wasm_u32x4_shr(u3, 4);
uC = wasm_v128_and(s0, mask_C);
uC = wasm_u32x4_shr(uC, 2);
t0 = wasm_v128_or(u3, uC);
s1 = wasm_v128_load(sp + sstr);
u3 = wasm_v128_and(s1, mask_3);
u3 = wasm_u32x4_shr(u3, 2);
uC = wasm_v128_and(s1, mask_C);
t1 = wasm_v128_or(u3, uC);
v128_t r = wasm_v128_or(t0, t1);
r = wasm_i8x16_swizzle(r, shuffle_mask);
wasm_v128_store32_lane(dp, r, 0);
ui16* dp = sigma + (y >> 2) * mstr;   // clear the sigma row below the last stripe
v128_t zero = wasm_i64x2_const(0, 0);
for (ui32 x = 0; x < width; x += 32, dp += 8)
wasm_v128_store(dp, zero);
ui16 prev_row_sig[256 + 8] = {0};
frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
for (ui32 y = 0; y < height; y += 4)
ui32 pattern = 0xFFFFu;   // one bit per sample of the 4x4 block
if (height - y < 4) {
if (height - y < 3) {
ui16 *prev_sig = prev_row_sig;
ui16 *cur_sig = sigma + (y >> 2) * mstr;
ui32 *dpp = decoded_data + y * stride;
for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
pattern = pattern >> (s * 4);
ui32 ns = *(ui32 *)(cur_sig + mstr);   // sigma of the stripe below
ui32 u = (ps & 0x88888888) >> 3;       // bottom row of the stripe above
u |= (ns & 0x11111111) << 3;           // top row of the stripe below
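// cs holds the significance of the current 4x4 block (one bit per sample,
// four bits per column).  mbr collects the neighbourhood of the significant
// samples: the vertical neighbours come from the in-column shifts below,
// while u supplies the adjacent rows of the stripes above and below; the
// result marks the samples that may become significant in this pass.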
mbr |= (cs & 0x77777777) << 1;   // vertical neighbours within each column
mbr |= (cs & 0xEEEEEEEE) >> 1;
v128_t cwd_vec = frwd_fetch<0>(&sigprop);
ui32 cwd = wasm_u32x4_extract_lane(cwd_vec, 0);
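// Scan the block column by column; new_sig starts from the candidate set
// built above.  For every candidate one bit is read from cwd: the candidate
// bit is cleared first, and a 1 bit re-marks the sample as significant and
// adds its not-yet-significant causal neighbours (same and next column) as
// new candidates; that is what the 0x33 / 0x76 / 0xEC / 0xC8 patterns below
// encode for the four row positions.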
ui32 col_mask = 0xFu;
ui32 inv_sig = ~cs & pattern;
for (int i = 0; i < 16; i += 4, col_mask <<= 4)
if ((col_mask & new_sig) == 0)
ui32 sample_mask = 0x1111u & col_mask;
if (new_sig & sample_mask)
new_sig &= ~sample_mask;
ui32 t = 0x33u << i;
new_sig |= t & inv_sig;
sample_mask += sample_mask;   // next row
if (new_sig & sample_mask)
new_sig &= ~sample_mask;
ui32 t = 0x76u << i;
new_sig |= t & inv_sig;
sample_mask += sample_mask;   // next row
if (new_sig & sample_mask)
new_sig &= ~sample_mask;
ui32 t = 0xECu << i;
new_sig |= t & inv_sig;
sample_mask += sample_mask;   // next row
if (new_sig & sample_mask)
new_sig &= ~sample_mask;
ui32 t = 0xC8u << i;
new_sig |= t & inv_sig;
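// Apply the newly significant samples with SIMD: new_sig and cwd are
// expanded so that every sample gets one byte, a byte-wise exclusive prefix
// sum routes the right codeword bit (the sign) to each newly significant
// sample, and the rows are then written back with the sign in bit 31 and
// the magnitude seed 3 << (p - 2).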
v128_t new_sig_vec = wasm_i16x8_splat((si16)new_sig);
new_sig_vec = wasm_i8x16_swizzle(new_sig_vec,
  wasm_i8x16_const(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1));
new_sig_vec = wasm_v128_and(new_sig_vec,
  wasm_i8x16_const(1, 2, 4, 8, 16, 32, 64, -128,
                   1, 2, 4, 8, 16, 32, 64, -128));   // per-byte bit masks
new_sig_vec = wasm_i8x16_eq(new_sig_vec,
  wasm_i8x16_const(1, 2, 4, 8, 16, 32, 64, -128,
                   1, 2, 4, 8, 16, 32, 64, -128));
v128_t ex_sum, shfl, inc_sum = new_sig_vec;
inc_sum = wasm_i8x16_abs(inc_sum);
shfl = wasm_i8x16_shuffle(wasm_i64x2_const(0, 0), inc_sum,
  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30);
inc_sum = wasm_i8x16_add(inc_sum, shfl);
shfl = wasm_i16x8_shuffle(wasm_i64x2_const(0, 0), inc_sum,
  7, 8, 9, 10, 11, 12, 13, 14);
inc_sum = wasm_i8x16_add(inc_sum, shfl);
shfl = wasm_i32x4_shuffle(wasm_i64x2_const(0, 0), inc_sum, 3, 4, 5, 6);
inc_sum = wasm_i8x16_add(inc_sum, shfl);
shfl = wasm_i64x2_shuffle(wasm_i64x2_const(0, 0), inc_sum, 1, 2);
inc_sum = wasm_i8x16_add(inc_sum, shfl);
cnt += wasm_u8x16_extract_lane(inc_sum, 15);
ex_sum = wasm_i8x16_shuffle(wasm_i64x2_const(0, 0), inc_sum,
  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30);
cwd_vec = wasm_i16x8_splat((si16)cwd);
cwd_vec = wasm_i8x16_swizzle(cwd_vec,
  wasm_i8x16_const(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1));
cwd_vec = wasm_v128_and(cwd_vec,
  wasm_i8x16_const(1, 2, 4, 8, 16, 32, 64, -128,
                   1, 2, 4, 8, 16, 32, 64, -128));
cwd_vec = wasm_i8x16_eq(cwd_vec,
  wasm_i8x16_const(1, 2, 4, 8, 16, 32, 64, -128,
                   1, 2, 4, 8, 16, 32, 64, -128));
cwd_vec = wasm_i8x16_abs(cwd_vec);
v128_t v = wasm_i8x16_swizzle(cwd_vec, ex_sum);   // one sign bit per new_sig sample
v128_t m = wasm_i8x16_const(
  0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1);
v128_t val = wasm_i32x4_splat(3 << (p - 2));
ui32 *dp = dpp;
for (int c = 0; c < 4; ++c) {
v128_t s0, s0_ns, s0_val;
s0 = wasm_v128_load(dp);
s0_ns = wasm_i8x16_swizzle(new_sig_vec, m);
s0_ns = wasm_i32x4_eq(s0_ns,
  wasm_i32x4_const(OJPH_REPEAT4(0xFF)));   // newly significant samples of row c
s0_val = wasm_i8x16_swizzle(v, m);         // their sign bits
s0_val = wasm_i32x4_shl(s0_val, 31);
s0_val = wasm_v128_or(s0_val, val);
s0_val = wasm_v128_and(s0_val, s0_ns);
s0 = wasm_v128_or(s0, s0_val);
wasm_v128_store(dp, s0);
dp += stride;
m = wasm_i32x4_add(m, wasm_i32x4_const(OJPH_REPEAT4(1)));   // next row's bytes
*prev_sig = (ui16)(new_sig);
new_sig |= (t & 0x7777) << 1;
new_sig |= (t & 0xEEEE) >> 1;
rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
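// Magnitude refinement: every sample that is already significant (per the
// sigma map) contributes one bit, read from the MRP segment through the
// reversed reader initialized above; the bit is applied to the sample by
// XORing at bit position p - 1, which nudges the reconstruction value.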
for (ui32 y = 0; y < height; y += 4)
ui16 *cur_sig = sigma + (y >> 2) * mstr;
ui32 *dpp = decoded_data + y * stride;
for (ui32 i = 0; i < width; i += 4, dpp += 4)
ui16 sig = *cur_sig++;
v128_t sig_vec = wasm_i16x8_splat((si16)sig);
sig_vec = wasm_i8x16_swizzle(sig_vec,
  wasm_i8x16_const(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1));
sig_vec = wasm_v128_and(sig_vec,
  wasm_i8x16_const(1, 2, 4, 8, 16, 32, 64, -128,
                   1, 2, 4, 8, 16, 32, 64, -128));
sig_vec = wasm_i8x16_eq(sig_vec,
  wasm_i8x16_const(1, 2, 4, 8, 16, 32, 64, -128,
                   1, 2, 4, 8, 16, 32, 64, -128));
sig_vec = wasm_i8x16_abs(sig_vec);
v128_t ex_sum, shfl, inc_sum = sig_vec;
shfl = wasm_i8x16_shuffle(wasm_i64x2_const(0, 0), inc_sum,
  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30);
inc_sum = wasm_i8x16_add(inc_sum, shfl);
shfl = wasm_i16x8_shuffle(wasm_i64x2_const(0, 0), inc_sum,
  7, 8, 9, 10, 11, 12, 13, 14);
inc_sum = wasm_i8x16_add(inc_sum, shfl);
shfl = wasm_i32x4_shuffle(wasm_i64x2_const(0, 0), inc_sum, 3, 4, 5, 6);
inc_sum = wasm_i8x16_add(inc_sum, shfl);
shfl = wasm_i64x2_shuffle(wasm_i64x2_const(0, 0), inc_sum, 1, 2);
inc_sum = wasm_i8x16_add(inc_sum, shfl);
total_bits = wasm_u8x16_extract_lane(inc_sum, 15);
ex_sum = wasm_i8x16_shuffle(wasm_i64x2_const(0, 0), inc_sum,
  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30);
v128_t cwd_vec = wasm_i16x8_splat((si16)cwd);
cwd_vec = wasm_i8x16_swizzle(cwd_vec,
  wasm_i8x16_const(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1));
cwd_vec = wasm_v128_and(cwd_vec,
  wasm_i8x16_const(1, 2, 4, 8, 16, 32, 64, -128,
                   1, 2, 4, 8, 16, 32, 64, -128));
cwd_vec = wasm_i8x16_eq(cwd_vec,
  wasm_i8x16_const(1, 2, 4, 8, 16, 32, 64, -128,
                   1, 2, 4, 8, 16, 32, 64, -128));
cwd_vec = wasm_i8x16_add(cwd_vec, wasm_i8x16_const(OJPH_REPEAT16(1)));
cwd_vec = wasm_i8x16_add(cwd_vec, cwd_vec);
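// After the add and double above, each byte of cwd_vec is 0 where the
// corresponding refinement bit is 1 and 2 where it is 0.  The loop below
// gathers that byte for every significant sample (via the prefix sums in
// ex_sum), shifts it up to bit p - 1, and XORs it into the stored sample.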
v128_t m = wasm_i8x16_const(0, -1, -1, -1, 4, -1, -1, -1,
                            8, -1, -1, -1, 12, -1, -1, -1);
ui32 *dp = dpp;
for (int c = 0; c < 4; ++c) {
v128_t s0, s0_sig, s0_idx, s0_val;
s0 = wasm_v128_load(dp);
s0_sig = wasm_i8x16_swizzle(sig_vec, m);
s0_sig = wasm_i8x16_eq(s0_sig, wasm_i64x2_const(0, 0));
s0_idx = wasm_i8x16_swizzle(ex_sum, m);
s0_val = wasm_i8x16_swizzle(cwd_vec, s0_idx);
s0_val = wasm_v128_andnot(s0_val, s0_sig);   // keep only significant samples
s0_val = wasm_i32x4_shl(s0_val, p - 2);
s0 = wasm_v128_xor(s0, s0_val);
wasm_v128_store(dp, s0);
dp += stride;
m = wasm_i32x4_add(m, wasm_i32x4_const(OJPH_REPEAT4(1)));