Key frame temporal filtering

Added key frame temporal filtering. Enabled it for VOD encoding with encoder speed < 2. Minor improvement in prediction. Added the restriction of using no more than "arnr_max_frames" frames for temporal filtering.

Key frame temporal filtering is turned off by default for now. To enable it, set "--enable-keyframe-filtering=1".

Borg result with "--enable-keyframe-filtering=1":

          avg_psnr  ovr_psnr  ssim    vmaf
hdres2:   -0.762    -0.863    -0.903  -0.680
midres2:  -0.813    -0.753    -0.757  -0.743
lowres2:  -0.492    -0.598    -0.737  -0.881

The impact on the encoder time is minimal.

Change-Id: If6abea3e21efcb96f1978cd9dfaa742c40dc2a56
webmproject · Aug 19, 2024 · a5ea71f · a5ea71f
1 parent 5d20cc3
commit a5ea71f
Show file tree

Hide file tree

Showing 6 changed files with 107 additions and 42 deletions.
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
@@ -1042,7 +1042,7 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
   vpx_free_frame_buffer(&cpi->last_frame_uf);
   vpx_free_frame_buffer(&cpi->scaled_source);
   vpx_free_frame_buffer(&cpi->scaled_last_source);
-  vpx_free_frame_buffer(&cpi->alt_ref_buffer);
+  vpx_free_frame_buffer(&cpi->tf_buffer);
 #ifdef ENABLE_KF_DENOISE
   vpx_free_frame_buffer(&cpi->raw_unscaled_source);
   vpx_free_frame_buffer(&cpi->raw_scaled_source);
@@ -1299,15 +1299,15 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
                        "Failed to allocate lag buffers");
 
   // TODO(agrange) Check if ARF is enabled and skip allocation if not.
-  if (vpx_realloc_frame_buffer(&cpi->alt_ref_buffer, oxcf->width, oxcf->height,
+  if (vpx_realloc_frame_buffer(&cpi->tf_buffer, oxcf->width, oxcf->height,
                                cm->subsampling_x, cm->subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
                                cm->use_highbitdepth,
 #endif
                                VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
                                NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate altref buffer");
+                       "Failed to allocate temporal filter buffer");
 }
 
 static void alloc_util_frame_buffers(VP9_COMP *cpi) {
@@ -6460,7 +6460,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
 #endif
         // Produce the filtered ARF frame.
         vp9_temporal_filter(cpi, arf_src_index);
-        vpx_extend_frame_borders(&cpi->alt_ref_buffer);
+        vpx_extend_frame_borders(&cpi->tf_buffer);
 #if CONFIG_COLLECT_COMPONENT_TIMING
         end_timing(cpi, vp9_temporal_filter_time);
 #endif
@@ -6470,7 +6470,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
         if (cpi->oxcf.alt_ref_aq != 0 && not_low_bitrate && not_last_frame)
           vp9_alt_ref_aq_setup_mode(cpi->alt_ref_aq, cpi);
 
-        force_src_buffer = &cpi->alt_ref_buffer;
+        force_src_buffer = &cpi->tf_buffer;
       }
 #endif
       cm->show_frame = 0;
@@ -6587,6 +6587,26 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   } else if (oxcf->pass == 1) {
     set_frame_size(cpi);
   }
+
+  // Key frame temporal filtering
+  const int is_key_temporal_filter_enabled =
+      oxcf->enable_keyframe_filtering && cpi->oxcf.mode != REALTIME &&
+      (oxcf->pass != 1) && !cpi->use_svc &&
+      !is_lossless_requested(&cpi->oxcf) && cm->frame_type == KEY_FRAME &&
+      (oxcf->arnr_max_frames > 0) && (oxcf->arnr_strength > 0) &&
+      cpi->oxcf.speed < 2;
+  // Save the pointer to the original source image.
+  YV12_BUFFER_CONFIG *source_buffer = cpi->un_scaled_source;
+
+  if (is_key_temporal_filter_enabled && source != NULL) {
+    // Produce the filtered Key frame. Set distance to -1 since the key frame
+    // is already popped out.
+    vp9_temporal_filter(cpi, -1);
+    vpx_extend_frame_borders(&cpi->tf_buffer);
+    force_src_buffer = &cpi->tf_buffer;
+    cpi->un_scaled_source = cpi->Source =
+        force_src_buffer ? force_src_buffer : &source->img;
+  }
 #endif  // !CONFIG_REALTIME_ONLY
 
   if (oxcf->pass != 1 && cpi->level_constraint.level_index >= 0 &&
@@ -6717,6 +6737,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   if (cpi->keep_level_stats && oxcf->pass != 1)
     update_level_info(cpi, size, arf_src_index);
 
+#if !CONFIG_REALTIME_ONLY
+  if (is_key_temporal_filter_enabled && cpi->b_calculate_psnr) {
+    cpi->raw_source_frame = vp9_scale_if_required(
+        cm, source_buffer, &cpi->scaled_source, (oxcf->pass == 0), EIGHTTAP, 0);
+  }
+#endif  // !CONFIG_REALTIME_ONLY
+
 #if CONFIG_INTERNAL_STATS
 
   if (oxcf->pass != 1 && !cpi->last_frame_dropped) {

diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
@@ -263,6 +263,8 @@ typedef struct VP9EncoderConfig {
 
   int enable_tpl_model;
 
+  int enable_keyframe_filtering;
+
   int max_threads;
 
   unsigned int target_level;
@@ -503,6 +505,7 @@ typedef struct ARNRFilterData {
   int frame_count;
   int alt_ref_index;
   struct scale_factors sf;
+  YV12_BUFFER_CONFIG *dst;
 } ARNRFilterData;
 
 typedef struct EncFrameBuf {
@@ -872,7 +875,7 @@ typedef struct VP9_COMP {
   // Force recalculation of segment_ids for each mode info
   uint8_t force_update_segmentation;
 
-  YV12_BUFFER_CONFIG alt_ref_buffer;
+  YV12_BUFFER_CONFIG tf_buffer;
 
   // class responsible for adaptive
   // quantization of altref frames

diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
@@ -48,6 +48,7 @@ static int64_t highbd_index_mult[14] = { 0U,          0U,          0U,
 // Prediction function using 12-tap interpolation filter.
 // TODO([email protected]): add SIMD optimization.
 #define MAX_FILTER_TAP 12
+#define TF_INTERP_EXTEND 6
 typedef int16_t InterpKernel12[MAX_FILTER_TAP];
 // 12-tap filter (used by the encoder only).
 DECLARE_ALIGNED(256, static const InterpKernel12,
@@ -861,6 +862,7 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
   DECLARE_ALIGNED(16, uint16_t, count[BLK_PELS * 3]);
   MACROBLOCKD *mbd = &td->mb.e_mbd;
   YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
+  YV12_BUFFER_CONFIG *dst = arnr_filter_data->dst;
   uint8_t *dst1, *dst2;
 #if CONFIG_VP9_HIGHBITDEPTH
   DECLARE_ALIGNED(16, uint16_t, predictor16[BLK_PELS * 3]);
@@ -886,18 +888,17 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
 
   // Source frames are extended to 16 pixels. This is different than
   //  L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS)
-  // A 6/8 tap filter is used for motion search.  This requires 2 pixels
-  //  before and 3 pixels after.  So the largest Y mv on a border would
-  //  then be 16 - VP9_INTERP_EXTEND. The UV blocks are half the size of the
-  //  Y and therefore only extended by 8.  The largest mv that a UV block
-  //  can support is 8 - VP9_INTERP_EXTEND.  A UV mv is half of a Y mv.
-  //  (16 - VP9_INTERP_EXTEND) >> 1 which is greater than
-  //  8 - VP9_INTERP_EXTEND.
-  // To keep the mv in play for both Y and UV planes the max that it
-  //  can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1).
-  td->mb.mv_limits.row_min = -((mb_row * BH) + (17 - 2 * VP9_INTERP_EXTEND));
+  // A 6/8/12 tap filter is used for motion search and prediction. So the
+  // largest Y mv on a border would then be 16 - TF_INTERP_EXTEND. The UV
+  // blocks are half the size of the Y and therefore only extended by 8.
+  // The largest mv that a UV block can support is 8 - TF_INTERP_EXTEND.
+  // A UV mv is half of a Y mv. (16 - TF_INTERP_EXTEND) >> 1 is greater than
+  // 8 - TF_INTERP_EXTEND. To keep the mv in play for both Y and UV planes,
+  // the max that it can be on a border is therefore 16 - (2 * TF_INTERP_EXTEND
+  // + 1).
+  td->mb.mv_limits.row_min = -((mb_row * BH) + (17 - 2 * TF_INTERP_EXTEND));
   td->mb.mv_limits.row_max =
-      ((mb_rows - 1 - mb_row) * BH) + (17 - 2 * VP9_INTERP_EXTEND);
+      ((mb_rows - 1 - mb_row) * BH) + (17 - 2 * TF_INTERP_EXTEND);
 
   for (mb_col = mb_col_start; mb_col < mb_col_end; mb_col++) {
     int i, j, k;
@@ -907,9 +908,9 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
     vp9_zero_array(accumulator, BLK_PELS * 3);
     vp9_zero_array(count, BLK_PELS * 3);
 
-    td->mb.mv_limits.col_min = -((mb_col * BW) + (17 - 2 * VP9_INTERP_EXTEND));
+    td->mb.mv_limits.col_min = -((mb_col * BW) + (17 - 2 * TF_INTERP_EXTEND));
     td->mb.mv_limits.col_max =
-        ((mb_cols - 1 - mb_col) * BW) + (17 - 2 * VP9_INTERP_EXTEND);
+        ((mb_cols - 1 - mb_col) * BW) + (17 - 2 * TF_INTERP_EXTEND);
 
     if (cpi->oxcf.content == VP9E_CONTENT_FILM) {
       unsigned int src_variance;
@@ -1054,9 +1055,9 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
       uint16_t *dst1_16;
       uint16_t *dst2_16;
       // Normalize filter output to produce AltRef frame
-      dst1 = cpi->alt_ref_buffer.y_buffer;
+      dst1 = dst->y_buffer;
       dst1_16 = CONVERT_TO_SHORTPTR(dst1);
-      stride = cpi->alt_ref_buffer.y_stride;
+      stride = dst->y_stride;
       byte = mb_y_offset;
       for (i = 0, k = 0; i < BH; i++) {
         for (j = 0; j < BW; j++, k++) {
@@ -1073,11 +1074,11 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
         byte += stride - BW;
       }
 
-      dst1 = cpi->alt_ref_buffer.u_buffer;
-      dst2 = cpi->alt_ref_buffer.v_buffer;
+      dst1 = dst->u_buffer;
+      dst2 = dst->v_buffer;
       dst1_16 = CONVERT_TO_SHORTPTR(dst1);
       dst2_16 = CONVERT_TO_SHORTPTR(dst2);
-      stride = cpi->alt_ref_buffer.uv_stride;
+      stride = dst->uv_stride;
       byte = mb_uv_offset;
       for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) {
         for (j = 0; j < mb_uv_width; j++, k++) {
@@ -1103,8 +1104,8 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
       }
     } else {
       // Normalize filter output to produce AltRef frame
-      dst1 = cpi->alt_ref_buffer.y_buffer;
-      stride = cpi->alt_ref_buffer.y_stride;
+      dst1 = dst->y_buffer;
+      stride = dst->y_stride;
       byte = mb_y_offset;
       for (i = 0, k = 0; i < BH; i++) {
         for (j = 0; j < BW; j++, k++) {
@@ -1120,9 +1121,9 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
         byte += stride - BW;
       }
 
-      dst1 = cpi->alt_ref_buffer.u_buffer;
-      dst2 = cpi->alt_ref_buffer.v_buffer;
-      stride = cpi->alt_ref_buffer.uv_stride;
+      dst1 = dst->u_buffer;
+      dst2 = dst->v_buffer;
+      stride = dst->uv_stride;
       byte = mb_uv_offset;
       for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) {
         for (j = 0; j < mb_uv_width; j++, k++) {
@@ -1148,8 +1149,8 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
     }
 #else
     // Normalize filter output to produce AltRef frame
-    dst1 = cpi->alt_ref_buffer.y_buffer;
-    stride = cpi->alt_ref_buffer.y_stride;
+    dst1 = dst->y_buffer;
+    stride = dst->y_stride;
     byte = mb_y_offset;
     for (i = 0, k = 0; i < BH; i++) {
       for (j = 0; j < BW; j++, k++) {
@@ -1165,9 +1166,9 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
       byte += stride - BW;
     }
 
-    dst1 = cpi->alt_ref_buffer.u_buffer;
-    dst2 = cpi->alt_ref_buffer.v_buffer;
-    stride = cpi->alt_ref_buffer.uv_stride;
+    dst1 = dst->u_buffer;
+    dst2 = dst->v_buffer;
+    stride = dst->uv_stride;
     byte = mb_uv_offset;
     for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) {
       for (j = 0; j < mb_uv_width; j++, k++) {
@@ -1233,10 +1234,10 @@ static void adjust_arnr_filter(VP9_COMP *cpi, int distance, int group_boost,
                                int *arnr_frames, int *frames_backward,
                                int *frames_forward, int *arnr_strength) {
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
 
-  int max_fwd = vp9_lookahead_depth(cpi->lookahead) - distance - 1;
-  int max_bwd = distance;
+  int max_fwd =
+      VPXMAX((int)vp9_lookahead_depth(cpi->lookahead) - distance - 1, 0);
+  int max_bwd = VPXMAX(distance, 0);
   int frames = VPXMAX(oxcf->arnr_max_frames, 1);
   int q, base_strength, strength;
 
@@ -1265,16 +1266,15 @@ static void adjust_arnr_filter(VP9_COMP *cpi, int distance, int group_boost,
 
   // Adjust number of frames in filter and strength based on gf boost level.
   frames = VPXMIN(frames, group_boost / 150);
-  frames += !(frames & 1);  // Make the number odd.
 
   if (strength > group_boost / 300) {
     strength = group_boost / 300;
   }
 
   if (VPXMIN(max_fwd, max_bwd) >= frames / 2) {
-    // just use half half
+    // Handle the even/odd case.
     *frames_backward = frames / 2;
-    *frames_forward = frames / 2;
+    *frames_forward = (frames - 1) / 2;
   } else {
     if (max_fwd < frames / 2) {
       *frames_forward = max_fwd;
@@ -1297,8 +1297,7 @@ static void adjust_arnr_filter(VP9_COMP *cpi, int distance, int group_boost,
   // TODO(jingning): Skip temporal filtering for intermediate frames that will
   // be used as show_existing_frame. Need to further explore the possibility to
   // apply certain filter.
-  if (gf_group->arf_src_offset[gf_group->index] <
-      cpi->rc.baseline_gf_interval - 1) {
+  if (frames <= 1) {
     frames = 1;
     *frames_backward = 0;
     *frames_forward = 0;
@@ -1332,6 +1331,7 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
   arnr_filter_data->strength = strength;
   arnr_filter_data->frame_count = frames_to_blur;
   arnr_filter_data->alt_ref_index = frames_to_blur_backward;
+  arnr_filter_data->dst = &cpi->tf_buffer;
 
   // Setup frame pointers, NULL indicates frame not included in filter.
   for (frame = 0; frame < frames_to_blur; ++frame) {
@@ -1341,6 +1341,11 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
     frames[frames_to_blur - 1 - frame] = &buf->img;
   }
 
+  YV12_BUFFER_CONFIG *f = frames[arnr_filter_data->alt_ref_index];
+  xd->cur_buf = f;
+  xd->plane[1].subsampling_y = f->subsampling_y;
+  xd->plane[1].subsampling_x = f->subsampling_x;
+
   if (frames_to_blur > 0) {
     // Setup scaling factors. Scaling on each of the arnr frames is not
     // supported.

diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
@@ -44,6 +44,7 @@ typedef struct vp9_extracfg {
   unsigned int tile_columns;
   unsigned int tile_rows;
   unsigned int enable_tpl_model;
+  unsigned int enable_keyframe_filtering;
   unsigned int arnr_max_frames;
   unsigned int arnr_strength;
   unsigned int min_gf_interval;
@@ -83,6 +84,7 @@ static struct vp9_extracfg default_extra_cfg = {
   6,                     // tile_columns
   0,                     // tile_rows
   1,                     // enable_tpl_model
+  0,                     // enable_keyframe_filtering
   7,                     // arnr_max_frames
   5,                     // arnr_strength
   0,                     // min_gf_interval; 0 -> default decision
@@ -614,6 +616,8 @@ static vpx_codec_err_t set_encoder_config(
 
   oxcf->enable_tpl_model = extra_cfg->enable_tpl_model;
 
+  oxcf->enable_keyframe_filtering = extra_cfg->enable_keyframe_filtering;
+
   // TODO(yunqing): The dependencies between row tiles cause error in multi-
   // threaded encoding. For now, tile_rows is forced to be 0 in this case.
   // The further fix can be done by adding synchronizations after a tile row
@@ -965,6 +969,14 @@ static vpx_codec_err_t ctrl_set_tpl_model(vpx_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_keyframe_filtering(vpx_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_keyframe_filtering =
+      CAST(VP9E_SET_KEY_FRAME_FILTERING, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static vpx_codec_err_t ctrl_set_arnr_max_frames(vpx_codec_alg_priv_t *ctx,
                                                 va_list args) {
   struct vp9_extracfg extra_cfg = ctx->extra_cfg;
@@ -2108,6 +2120,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { VP9E_SET_TILE_COLUMNS, ctrl_set_tile_columns },
   { VP9E_SET_TILE_ROWS, ctrl_set_tile_rows },
   { VP9E_SET_TPL, ctrl_set_tpl_model },
+  { VP9E_SET_KEY_FRAME_FILTERING, ctrl_set_keyframe_filtering },
   { VP8E_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames },
   { VP8E_SET_ARNR_STRENGTH, ctrl_set_arnr_strength },
   { VP8E_SET_ARNR_TYPE, ctrl_set_arnr_type },
@@ -2456,6 +2469,8 @@ void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp) {
 
   DUMP_STRUCT_VALUE(fp, oxcf, enable_tpl_model);
 
+  DUMP_STRUCT_VALUE(fp, oxcf, enable_keyframe_filtering);
+
   DUMP_STRUCT_VALUE(fp, oxcf, max_threads);
 
   DUMP_STRUCT_VALUE(fp, oxcf, target_level);

diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
@@ -673,6 +673,14 @@ enum vp8e_enc_control_id {
    */
   VP9E_SET_TPL,
 
+  /*!\brief Codec control function to enable key frame temporal filtering.
+   *
+   * Vp9 allows the encoder to run key frame temporal filtering and use it to
+   * improve the compression performance. To enable, set this parameter to be
+   * 1. The default value is set to be 0.
+   */
+  VP9E_SET_KEY_FRAME_FILTERING,
+
   /*!\brief Codec control function to enable postencode frame drop.
    *
    * This will allow encoder to drop frame after it's encoded.
@@ -1078,6 +1086,8 @@ VPX_CTRL_USE_TYPE(VP9E_SET_SVC_SPATIAL_LAYER_SYNC,
 #define VPX_CTRL_VP9E_SET_SVC_SPATIAL_LAYER_SYNC
 VPX_CTRL_USE_TYPE(VP9E_SET_TPL, int)
 #define VPX_CTRL_VP9E_SET_TPL
+VPX_CTRL_USE_TYPE(VP9E_SET_KEY_FRAME_FILTERING, int)
+#define VPX_CTRL_VP9E_SET_KEY_FRAME_FILTERING
 VPX_CTRL_USE_TYPE(VP9E_SET_POSTENCODE_DROP, unsigned int)
 #define VPX_CTRL_VP9E_SET_POSTENCODE_DROP
 VPX_CTRL_USE_TYPE(VP9E_SET_DELTA_Q_UV, int)