Fix a hang in sync

[handbrake-jp/handbrake-jp-git.git] / libhb / decomb.c
diff --git a/libhb/decomb.c b/libhb/decomb.c

index e5779c8..0873d4b 100644 (file)
--- a/libhb/decomb.c
+++ b/libhb/decomb.c
@@ -4,14 +4,74 @@
     Homepage: <http://handbrake.fr/>.
     It may be used under the terms of the GNU General Public License. 
     
-   The yadif algorithm was created by Michael Niedermayer. */
+   The yadif algorithm was created by Michael Niedermayer.
+   Tritical's work inspired much of the comb detection code:
+   http://web.missouri.edu/~kes25c/
+*/
+
+/*****
+Parameters:
+    Mode : Spatial metric : Motion thresh : Spatial thresh : Block thresh :
+    Block width : Block height
+
+Appended for EEDI2:
+    Magnitude thresh : Variance thresh : Laplacian thresh : Dilation thresh :
+    Erosion thresh : Noise thresh : Max search distance : Post-processing
+
+Plus:
+    Parity
+    
+Defaults:
+    7:2:6:9:80:16:16:10:20:20:4:2:50:24:1:-1
+*****/
+
+#define MODE_YADIF       1 // Use yadif
+#define MODE_BLEND       2 // Use blending interpolation
+#define MODE_CUBIC       4 // Use cubic interpolation
+#define MODE_EEDI2       8 // Use EEDI2 interpolation
+#define MODE_MCDEINT    16 // Post-process with mcdeint
+#define MODE_MASK       32 // Output combing masks instead of pictures
+
+/***** 
+These modes can be layered. For example, Yadif (1) + EEDI2 (8) = 9,
+which will feed EEDI2 interpolations to yadif.
+
+** Working combos:
+ 1: Just yadif
+ 2: Just blend
+ 3: Switch between yadif and blend
+ 4: Just cubic interpolate
+ 5: Cubic->yadif
+ 6: Switch between cubic and blend
+ 7: Switch between cubic->yadif and blend
+ 8: Just EEDI2 interpolate
+ 9: EEDI2->yadif
+10: Switch between EEDI2 and blend
+11: Switch between EEDI2->yadif and blend
+17: Yadif->mcdeint
+18: Blend->mcdeint
+19: Switch between blending and yadif -> mcdeint
+20: Cubic->mcdeint
+21: Cubic->yadif->mcdeint
+22: Cubic or blend -> mcdeint
+23: Cubic->yadif or blend -> mcdeint
+24: EEDI2->mcdeint
+25: EEDI2->yadif->mcdeint
+...okay I'm getting bored now listing all these different modes
+32: Passes through the combing mask for every combed frame (white for combed pixels, otherwise black)
+33+: Overlay the combing mask for every combed frame on top of the filtered output (white for combed pixels)
+
+12-15: EEDI2 will override cubic interpolation
+16: DOES NOT WORK BY ITSELF-- mcdeint needs to be fed by another deinterlacer
+*****/
+
  #include "hb.h"
-#include "ffmpeg/avcodec.h"
+#include "hbffmpeg.h"
  #include "mpeg2dec/mpeg2.h"
+#include "eedi2.h"
  
  #define SUPPRESS_AV_LOG
  
-#define MODE_DEFAULT     4
  #define PARITY_DEFAULT   -1
  
  #define MCDEINT_MODE_DEFAULT   -1
@@ -21,14 +81,80 @@
  #define MIN3(a,b,c) MIN(MIN(a,b),c)
  #define MAX3(a,b,c) MAX(MAX(a,b),c)
  
+// Some names to correspond to the pv->eedi_half array's contents
+#define SRCPF 0
+#define MSKPF 1
+#define TMPPF 2
+#define DSTPF 3
+// Some names to correspond to the pv->eedi_full array's contents
+#define DST2PF 0
+#define TMP2PF2 1
+#define MSK2PF 2
+#define TMP2PF 3
+#define DST2MPF 4
+
+struct yadif_arguments_s {
+    uint8_t **dst;
+    int parity;
+    int tff;
+    int stop;
+    int is_combed;
+};
+
+struct decomb_arguments_s {
+    int stop;
+};
+
+struct eedi2_arguments_s {
+    int stop;
+};
+
+typedef struct yadif_arguments_s yadif_arguments_t;
+typedef struct decomb_arguments_s decomb_arguments_t;
+typedef struct eedi2_arguments_s eedi2_arguments_t;
+
+typedef struct eedi2_thread_arg_s {
+    hb_filter_private_t *pv;
+    int plane;
+} eedi2_thread_arg_t;
+
+typedef struct decomb_thread_arg_s {
+    hb_filter_private_t *pv;
+    int segment;
+} decomb_thread_arg_t;
+
+typedef struct yadif_thread_arg_s {
+    hb_filter_private_t *pv;
+    int segment;
+} yadif_thread_arg_t;
+
  struct hb_filter_private_s
  {
      int              pix_fmt;
      int              width[3];
      int              height[3];
  
+    // Decomb parameters
      int              mode;
+    int              spatial_metric;
+    int              motion_threshold;
+    int              spatial_threshold;
+    int              block_threshold;
+    int              block_width;
+    int              block_height;
+    
+    // EEDI2 parameters
+    int              magnitude_threshold;
+    int              variance_threshold;
+    int              laplacian_threshold;
+    int              dilation_threshold;
+    int              erosion_threshold;
+    int              noise_threshold;
+    int              maximum_search_distance;
+    int              post_processing;
+
      int              parity;
+    int              tff;
      
      int              yadif_ready;
  
@@ -41,27 +167,46 @@ struct hb_filter_private_s
      AVFrame        * mcdeint_frame;
      AVFrame        * mcdeint_frame_dec;
  
-    int              comb;
-    int              color_equal;
-    int              color_diff;
-    int              threshold;
-    int              prog_equal;
-    int              prog_diff;
-    int              prog_threshold;
      int              deinterlaced_frames;
-    int              passed_frames;
+    int              blended_frames;
+    int              unfiltered_frames;
  
      uint8_t        * ref[4][3];
      int              ref_stride[3];
  
+    /* Make a buffer to store a comb mask. */
+    uint8_t        * mask[3];
+
+    uint8_t        * eedi_half[4][3];
+    uint8_t        * eedi_full[5][3];
+    int            * cx2;
+    int            * cy2;
+    int            * cxy;
+    int            * tmpc;
+    
      AVPicture        pic_in;
      AVPicture        pic_out;
      hb_buffer_t *    buf_out[2];
      hb_buffer_t *    buf_settings;
      
-    int              cc_array[3][480][270];
-    int              combed_macroblocks;
-    int              uncombed_macroblocks;
+    int              cpu_count;
+
+    hb_thread_t    ** yadif_threads;         // Threads for Yadif - one per CPU
+    hb_lock_t      ** yadif_begin_lock;      // Thread has work
+    hb_lock_t      ** yadif_complete_lock;   // Thread has completed work
+    yadif_arguments_t *yadif_arguments;      // Arguments to thread for work
+    
+    hb_thread_t    ** decomb_threads;        // Threads for comb detection - one per CPU
+    hb_lock_t      ** decomb_begin_lock;     // Thread has work
+    hb_lock_t      ** decomb_complete_lock;  // Thread has completed work
+    decomb_arguments_t *decomb_arguments;    // Arguments to thread for work
+
+    hb_thread_t    ** eedi2_threads;        // Threads for eedi2 - one per plane
+    hb_lock_t      ** eedi2_begin_lock;     // Thread has work
+    hb_lock_t      ** eedi2_complete_lock;  // Thread has completed work
+    eedi2_arguments_t *eedi2_arguments;    // Arguments to thread for work
+
+//    int              alternator;           // for bobbing parity when framedoubling
  };
  
  hb_filter_private_t * hb_decomb_init( int pix_fmt,
@@ -69,7 +214,7 @@ hb_filter_private_t * hb_decomb_init( int pix_fmt,
                                             int height,
                                             char * settings );
  
-int hb_decomb_work( hb_buffer_t * buf_in,
+int hb_decomb_work(      const hb_buffer_t * buf_in,
                           hb_buffer_t ** buf_out,
                           int pix_fmt,
                           int width,
@@ -81,14 +226,14 @@ void hb_decomb_close( hb_filter_private_t * pv );
  hb_filter_object_t hb_filter_decomb =
  {
      FILTER_DECOMB,
-    "Decombs selectively with (ffmpeg or yadif/mcdeint or blending)",
+    "Decomb",
      NULL,
      hb_decomb_init,
      hb_decomb_work,
      hb_decomb_close,
  };
  
-int cubic_interpolate( int y0, int y1, int y2, int y3 )
+int cubic_interpolate_pixel( int y0, int y1, int y2, int y3 )
  {
      /* From http://www.neuron2.net/library/cubicinterp.html */
      int result = ( y0 * -3 ) + ( y1 * 23 ) + ( y2 * 23 ) + ( y3 * -3 );
@@ -106,6 +251,111 @@ int cubic_interpolate( int y0, int y1, int y2, int y3 )
      return result;
  }
  
+static void cubic_interpolate_line( uint8_t *dst,
+                               uint8_t *cur,
+                               int plane,
+                               int y,
+                               hb_filter_private_t * pv )
+{
+    int w = pv->width[plane];
+    int refs = pv->ref_stride[plane];
+    int x;
+
+    for( x = 0; x < w; x++)
+    {
+        int a, b, c, d;
+        a = b = c = d = 0;
+        
+        if( y >= 3 )
+        {
+            /* Normal top*/
+            a = cur[-3*refs];
+            b = cur[-refs];
+        }
+        else if( y == 2 || y == 1 )
+        {
+            /* There's only one sample above this pixel, use it twice. */
+            a = cur[-refs];
+            b = cur[-refs];
+        }
+        else if( y == 0 )
+        {
+            /* No samples above, triple up on the one below. */
+            a = cur[+refs];
+            b = cur[+refs];
+        }
+        
+        if( y <= ( pv->height[plane] - 4 ) )
+        {
+            /* Normal bottom*/
+            c = cur[+refs];
+            d = cur[3*refs];            
+        }
+        else if( y == ( pv->height[plane] - 3 ) || y == ( pv->height[plane] - 2 ) )
+        {
+            /* There's only one sample below, use it twice. */
+            c = cur[+refs];
+            d = cur[+refs];
+        }
+        else if( y == pv->height[plane] - 1)
+        {
+            /* No samples below, triple up on the one above. */
+            c = cur[-refs];
+            d = cur[-refs];
+        }
+        
+        dst[0] = cubic_interpolate_pixel( a, b, c, d );
+        
+        dst++;
+        cur++;
+    }
+}
+
+void apply_mask_line( uint8_t * srcp,
+                      uint8_t * mskp,
+                      int width )
+{
+    int x;
+    
+    for( x = 0; x < width; x++ )
+    {
+        if( mskp[x] == 255 )
+        {
+            srcp[x] = 255;
+        }
+    }
+}
+
+void apply_mask( hb_filter_private_t * pv )
+{
+    int plane, height;
+    
+    for( plane = 0; plane < 3; plane++ )
+    {
+        uint8_t * srcp = ( pv->mode & MODE_MCDEINT ) ? pv->pic_in.data[plane] : pv->pic_out.data[plane];
+        uint8_t * mskp = pv->mask[plane];
+        
+        for( height = 0; height < pv->height[plane]; height++ )
+        {
+            if( pv->mode == MODE_MASK && plane == 0 )
+            {
+                memcpy( srcp, mskp, pv->width[plane] );
+            }
+            else if( pv->mode == MODE_MASK )
+            {
+                memset( srcp, 128, pv->width[plane] );
+            }
+            else if( plane == 0 )
+            {
+                apply_mask_line( srcp, mskp, pv->width[plane] );
+            }
+
+            srcp += pv->pic_out.linesize[plane];
+            mskp += pv->ref_stride[plane];
+        }
+    }
+}
+
  static void store_ref( const uint8_t ** pic,
                               hb_filter_private_t * pv )
  {
@@ -128,7 +378,7 @@ static void store_ref( const uint8_t ** pic,
          int ref_stride = pv->ref_stride[i];
  
          int y;
-        for( y = 0; y < pv->height[i]; y++ )
+        for( y = 0; y < h; y++ )
          {
              memcpy(ref, src, w);
              src = (uint8_t*)src + w;
@@ -137,6 +387,8 @@ static void store_ref( const uint8_t ** pic,
      }
  }
  
+/* This function may be useful in the future, if we want to output
+   a reference to an AVPicture, since they have different strides.
  static void get_ref( uint8_t ** pic, hb_filter_private_t * pv, int frm )
  {
      int i;
@@ -156,6 +408,7 @@ static void get_ref( uint8_t ** pic, hb_filter_private_t * pv, int frm )
          }
      }
  }
+*/
  
  int blend_filter_pixel( int up2, int up1, int current, int down1, int down2 )
  {
@@ -230,6 +483,514 @@ static void blend_filter_line( uint8_t *dst,
      }
  }
  
+int check_combing_mask( hb_filter_private_t * pv )
+{
+    /* Go through the mask in X*Y blocks. If any of these windows
+       have threshold or more combed pixels, consider the whole
+       frame to be combed and send it on to be deinterlaced.     */
+
+    /* Block mask threshold -- The number of pixels
+       in a block_width * block_height window of
+       the mask that need to show combing for the
+       whole frame to be seen as such.            */
+    int threshold       = pv->block_threshold;
+    int block_width     = pv->block_width;
+    int block_height    = pv->block_height;
+    int block_x, block_y;
+    int block_score = 0; int send_to_blend = 0;
+    uint8_t * mask_p;
+    int x, y, k;
+
+    for( k = 0; k < 1; k++ )
+    {
+        int ref_stride = pv->ref_stride[k];
+        for( y = 0; y < ( pv->height[k] - block_height ); y = y + block_height )
+        {
+            for( x = 0; x < ( pv->width[k] - block_width ); x = x + block_width )
+            {
+                block_score = 0;
+                
+                for( block_y = 0; block_y < block_height; block_y++ )
+                {
+                    int mask_y = y + block_y;
+                    mask_p = &pv->mask[k][mask_y*ref_stride + x];
+                    
+                    for( block_x = 0; block_x < block_width; block_x++ )
+                    {
+                        /* We only want to mark a pixel in a block as combed
+                           if the adjacent pixels are as well. Got to
+                           handle the sides separately.       */
+                        if( (x + block_x) == 0 )
+                        {
+                            if( mask_p[ 0 ] == 255 &&
+                                mask_p[ 1 ] == 255 )
+                                    block_score++;
+                        }
+                        else if( (x + block_x) == (pv->width[k] -1) )
+                        {
+                            if( mask_p[ -1 ] == 255 &&
+                                mask_p[  0 ] == 255 )
+                                    block_score++;
+                        }
+                        else
+                        {
+                            if( mask_p[ -1 ] == 255 &&
+                                mask_p[  0 ] == 255 &&
+                                mask_p[  1 ] == 255 )
+                                    block_score++;
+                        }
+                        mask_p++;
+                    }
+                }
+
+                if( block_score >= ( threshold / 2 ) )
+                {
+#if 0
+                    hb_log("decomb: frame %i | score %i | type %s", pv->deinterlaced_frames + pv->blended_frames +  pv->unfiltered_frames + 1, block_score, pv->buf_settings->flags & 16 ? "Film" : "Video");
+#endif
+                    if ( block_score <= threshold && !( pv->buf_settings->flags & 16) )
+                    {
+                        /* Blend video content that scores between
+                           ( threshold / 2 ) and threshold.        */
+                        send_to_blend = 1;
+                    }
+                    else if( block_score > threshold )
+                    {
+                        if( pv->buf_settings->flags & 16 )
+                        {
+                            /* Blend progressive content above the threshold.*/
+                            return 2;
+                        }
+                        else
+                        {
+                            /* Yadif deinterlace video content above the threshold. */
+                            return 1;
+                        }
+                    }
+                }
+            }
+        } 
+    }
+    
+    if( send_to_blend )
+    {
+        return 2;
+    }
+    else
+    {
+        /* Consider this frame to be uncombed. */
+        return 0;
+    }
+}
+
+void detect_combed_segment( hb_filter_private_t * pv, int segment_start, int segment_stop )
+{
+    /* A mish-mash of various comb detection tricks
+       picked up from neuron2's Decomb plugin for
+       AviSynth and tritical's IsCombedT and
+       IsCombedTIVTC plugins.                       */
+       
+    int x, y, k, width, height;
+    
+    /* Comb scoring algorithm */
+    int spatial_metric  = pv->spatial_metric;
+    /* Motion threshold */
+    int mthresh         = pv->motion_threshold;
+    /* Spatial threshold */
+    int athresh         = pv->spatial_threshold;
+    int athresh_squared = athresh * athresh;
+    int athresh6        = 6 *athresh;
+
+    /* One pass for Y only; the k < 1 bound below skips U and V */
+    for( k = 0; k < 1; k++ )
+    {
+        int ref_stride  = pv->ref_stride[k];
+        width           = pv->width[k];
+        height          = pv->height[k];
+        
+        /* Comb detection has to start at y = 2 and end at
+           y = height - 2, because it needs to examine
+           2 pixels above and 2 below the current pixel.      */
+        if( segment_start < 2 )
+            segment_start = 2;
+        if( segment_stop > height - 2 )
+            segment_stop = height - 2;
+            
+        for( y =  segment_start; y < segment_stop; y++ )
+        {
+            /* These are just to make the buffer locations easier to read. */
+            int up_2    = -2*ref_stride ;
+            int up_1    = -1*ref_stride;
+            int down_1 = ref_stride;
+            int down_2 = 2*ref_stride;
+            
+            /* We need to examine a column of 5 pixels
+               in the prev, cur, and next frames.      */
+            uint8_t * cur = &pv->ref[1][k][y*ref_stride];
+            uint8_t * prev = &pv->ref[0][k][y*ref_stride];
+            uint8_t * next = &pv->ref[2][k][y*ref_stride];
+            uint8_t * mask = &pv->mask[k][y*ref_stride];
+            
+            for( x = 0; x < width; x++ )
+            {
+                int up_diff = cur[0] - cur[up_1];
+                int down_diff = cur[0] - cur[down_1];
+                
+                if( ( up_diff >  athresh && down_diff >  athresh ) ||
+                    ( up_diff < -athresh && down_diff < -athresh ) )
+                {
+                    /* The pixel above and below are different,
+                       and they change in the same "direction" too.*/
+                    int motion = 0;
+                    if( mthresh > 0 )
+                    {
+                        /* Make sure there's sufficient motion between frame t-1 to frame t+1. */
+                        if( abs( prev[0] - cur[0] ) > mthresh &&
+                            abs(  cur[up_1] - next[up_1]    ) > mthresh &&
+                            abs(  cur[down_1] - next[down_1]    ) > mthresh )
+                                motion++;
+                        if( abs(     next[0] - cur[0] ) > mthresh &&
+                            abs( prev[up_1] - cur[up_1] ) > mthresh &&
+                            abs( prev[down_1] - cur[down_1] ) > mthresh )
+                                motion++;
+                    }
+                    else
+                    {
+                        /* User doesn't want to check for motion,
+                           so move on to the spatial check.       */
+                        motion = 1;
+                    }
+                           
+                    if( motion || ( pv->deinterlaced_frames==0 && pv->blended_frames==0 && pv->unfiltered_frames==0) )
+                    {
+                           /* That means it's time for the spatial check.
+                              We've got several options here.             */
+                        if( spatial_metric == 0 )
+                        {
+                            /* Simple 32detect style comb detection */
+                            if( ( abs( cur[0] - cur[down_2] ) < 10  ) &&
+                                ( abs( cur[0] - cur[down_1] ) > 15 ) )
+                            {
+                                mask[0] = 255;
+                            }
+                            else
+                            {
+                                mask[0] = 0;
+                            }
+                        }
+                        else if( spatial_metric == 1 )
+                        {
+                            /* This, for comparison, is what IsCombed uses.
+                               It's better, but still noise sensitive.     */
+                               int combing = ( cur[up_1] - cur[0] ) *
+                                             ( cur[down_1] - cur[0] );
+                               
+                               if( combing > athresh_squared )
+                                   mask[0] = 255; 
+                               else
+                                   mask[0] = 0;
+                        }
+                        else if( spatial_metric == 2 )
+                        {
+                            /* Tritical's noise-resistant combing scorer.
+                               The check is done on a bob+blur convolution. */
+                            int combing = abs( cur[up_2]
+                                             + ( 4 * cur[0] )
+                                             + cur[down_2]
+                                             - ( 3 * ( cur[up_1]
+                                                     + cur[down_1] ) ) );
+
+                            /* If the pixel is sufficiently combed,
+                               then mark it down on the mask as 255. */
+                            if( combing > athresh6 )
+                            {
+                                mask[0] = 255;
+                            }
+                            else
+                            {
+                                mask[0] = 0;
+                            }
+                        }
+                    }
+                    else
+                    {
+                        mask[0] = 0;
+                    }
+                }
+                else
+                {
+                    mask[0] = 0;
+                }
+                
+                cur++;
+                prev++;
+                next++;
+                mask++;
+            }
+        }
+    }
+}
+
+// This function calls all the eedi2 filters in sequence for a given plane.
+// It outputs the final interpolated image to pv->eedi_full[DST2PF].
+void eedi2_interpolate_plane( hb_filter_private_t * pv, int k )
+{
+    /* We need all these pointers. No, seriously.
+       I swear. It's not a joke. They're used.
+       All nine of them.                         */
+    uint8_t * mskp = pv->eedi_half[MSKPF][k];
+    uint8_t * srcp = pv->eedi_half[SRCPF][k];
+    uint8_t * tmpp = pv->eedi_half[TMPPF][k];
+    uint8_t * dstp = pv->eedi_half[DSTPF][k];
+    uint8_t * dst2p = pv->eedi_full[DST2PF][k];
+    uint8_t * tmp2p2 = pv->eedi_full[TMP2PF2][k];
+    uint8_t * msk2p = pv->eedi_full[MSK2PF][k];
+    uint8_t * tmp2p = pv->eedi_full[TMP2PF][k];
+    uint8_t * dst2mp = pv->eedi_full[DST2MPF][k];
+    int * cx2 = pv->cx2;
+    int * cy2 = pv->cy2;
+    int * cxy = pv->cxy;
+    int * tmpc = pv->tmpc;
+
+    int pitch = pv->ref_stride[k];
+    int height = pv->height[k]; int width = pv->width[k];
+    int half_height = height / 2;
+
+    // edge mask
+    eedi2_build_edge_mask( mskp, pitch, srcp, pitch,
+                     pv->magnitude_threshold, pv->variance_threshold, pv->laplacian_threshold, 
+                     half_height, width );
+    eedi2_erode_edge_mask( mskp, pitch, tmpp, pitch, pv->erosion_threshold, half_height, width );
+    eedi2_dilate_edge_mask( tmpp, pitch, mskp, pitch, pv->dilation_threshold, half_height, width );
+    eedi2_erode_edge_mask( mskp, pitch, tmpp, pitch, pv->erosion_threshold, half_height, width );
+    eedi2_remove_small_gaps( tmpp, pitch, mskp, pitch, half_height, width );
+
+    // direction mask
+    eedi2_calc_directions( k, mskp, pitch, srcp, pitch, tmpp, pitch,
+                     pv->maximum_search_distance, pv->noise_threshold,
+                     half_height, width );
+    eedi2_filter_dir_map( mskp, pitch, tmpp, pitch, dstp, pitch, half_height, width );
+    eedi2_expand_dir_map( mskp, pitch, dstp, pitch, tmpp, pitch, half_height, width );
+    eedi2_filter_map( mskp, pitch, tmpp, pitch, dstp, pitch, half_height, width );
+
+    // upscale 2x vertically
+    eedi2_upscale_by_2( srcp, dst2p, half_height, pitch );
+    eedi2_upscale_by_2( dstp, tmp2p2, half_height, pitch );
+    eedi2_upscale_by_2( mskp, msk2p, half_height, pitch );
+
+    // upscale the direction mask
+    eedi2_mark_directions_2x( msk2p, pitch, tmp2p2, pitch, tmp2p, pitch, pv->tff, height, width );
+    eedi2_filter_dir_map_2x( msk2p, pitch, tmp2p, pitch,  dst2mp, pitch, pv->tff, height, width );
+    eedi2_expand_dir_map_2x( msk2p, pitch, dst2mp, pitch, tmp2p, pitch, pv->tff, height, width );
+    eedi2_fill_gaps_2x( msk2p, pitch, tmp2p, pitch, dst2mp, pitch, pv->tff, height, width );
+    eedi2_fill_gaps_2x( msk2p, pitch, dst2mp, pitch, tmp2p, pitch, pv->tff, height, width );
+
+    // interpolate a full-size plane
+    eedi2_interpolate_lattice( k, tmp2p, pitch, dst2p, pitch, tmp2p2, pitch, pv->tff,
+                         pv->noise_threshold, height, width );
+
+    if( pv->post_processing == 1 || pv->post_processing == 3 )
+    {
+        // make sure the edge directions are consistent
+        eedi2_bit_blit( tmp2p2, pitch, tmp2p, pitch, pv->width[k], pv->height[k] );
+        eedi2_filter_dir_map_2x( msk2p, pitch, tmp2p, pitch, dst2mp, pitch, pv->tff, height, width );
+        eedi2_expand_dir_map_2x( msk2p, pitch, dst2mp, pitch, tmp2p, pitch, pv->tff, height, width );
+        eedi2_post_process( tmp2p, pitch, tmp2p2, pitch, dst2p, pitch, pv->tff, height, width );
+    }
+    if( pv->post_processing == 2 || pv->post_processing == 3 )
+    {
+        // filter junctions and corners
+        eedi2_gaussian_blur1( srcp, pitch, tmpp, pitch, srcp, pitch, half_height, width );
+        eedi2_calc_derivatives( srcp, pitch, half_height, width, cx2, cy2, cxy );
+        eedi2_gaussian_blur_sqrt2( cx2, tmpc, cx2, pitch, half_height, width);
+        eedi2_gaussian_blur_sqrt2( cy2, tmpc, cy2, pitch, half_height, width);
+        eedi2_gaussian_blur_sqrt2( cxy, tmpc, cxy, pitch, half_height, width);
+        eedi2_post_process_corner( cx2, cy2, cxy, pitch, tmp2p2, pitch, dst2p, pitch, height, width, pv->tff );
+    }
+}
+
+/*
+ *  eedi2 interpolate this plane in a single thread.
+ */
+void eedi2_filter_thread( void *thread_args_v )
+{
+    eedi2_arguments_t *eedi2_work = NULL;
+    hb_filter_private_t * pv;
+    int run = 1;
+    int plane;
+    eedi2_thread_arg_t *thread_args = thread_args_v;
+
+    pv = thread_args->pv;
+    plane = thread_args->plane;
+
+    hb_log("eedi2 thread started for plane %d", plane);
+
+    while( run )
+    {
+        /*
+         * Wait here until there is work to do. hb_lock() blocks until
+         * render releases it to say that there is more work to do.
+         */
+        hb_lock( pv->eedi2_begin_lock[plane] );
+
+        eedi2_work = &pv->eedi2_arguments[plane];
+
+        if( eedi2_work->stop )
+        {
+            /*
+             * No more work to do, exit this thread.
+             */
+            run = 0;
+            continue;
+        } 
+
+        /*
+         * Process plane
+         */
+            eedi2_interpolate_plane( pv, plane );
+        
+        /*
+         * Finished this plane, let everyone know.
+         */
+        hb_unlock( pv->eedi2_complete_lock[plane] );
+    }
+    free( thread_args_v );
+}
+
+// Sets up the input field planes for EEDI2 in pv->eedi_half[SRCPF]
+// and then runs eedi2_filter_thread for each plane.
+void eedi2_planer( hb_filter_private_t * pv )
+{
+    /* Copy the first field from the source to a half-height frame. */
+    int i;
+    for( i = 0;  i < 3; i++ )
+    {
+        int pitch = pv->ref_stride[i];
+        int start_line = !pv->tff;
+        eedi2_fill_half_height_buffer_plane( &pv->ref[1][i][pitch*start_line], pv->eedi_half[SRCPF][i], pitch, pv->height[i] );
+    }
+    
+    int plane;
+    for( plane = 0; plane < 3; plane++ )
+    {  
+        /*
+         * Let the thread for this plane know that we've set up work
+         * for it by releasing the begin lock (ensuring that the
+         * complete lock is already locked so that we block when
+         * we try to lock it again below).
+         */
+        hb_lock( pv->eedi2_complete_lock[plane] );
+        hb_unlock( pv->eedi2_begin_lock[plane] );
+    }
+
+    /*
+     * Wait until all three threads have completed by trying to get
+     * the complete lock that we locked earlier for each thread, which
+     * will block until that thread has completed the work on that
+     * plane.
+     */
+    for( plane = 0; plane < 3; plane++ )
+    {
+        hb_lock( pv->eedi2_complete_lock[plane] );
+        hb_unlock( pv->eedi2_complete_lock[plane] );
+    }
+}
+
+
+/*
+ * Comb detect this segment in a single thread (luma plane only for now).
+ */
+void decomb_filter_thread( void *thread_args_v )
+{
+    decomb_arguments_t *decomb_work = NULL;
+    hb_filter_private_t * pv;
+    int run = 1;
+    int segment, segment_start, segment_stop, plane;
+    decomb_thread_arg_t *thread_args = thread_args_v;
+
+    pv = thread_args->pv;
+    segment = thread_args->segment;
+
+    hb_log("decomb thread started for segment %d", segment);
+
+    while( run )
+    {
+        /*
+         * Wait here until there is work to do. hb_lock() blocks until
+         * render releases it to say that there is more work to do.
+         */
+        hb_lock( pv->decomb_begin_lock[segment] );
+
+        decomb_work = &pv->decomb_arguments[segment];
+
+        if( decomb_work->stop )
+        {
+            /*
+             * No more work to do, exit this thread.
+             */
+            run = 0;
+            continue;
+        } 
+
+        /*
+         * Process segment (for now just from luma)
+         */
+        for( plane = 0; plane < 1; plane++)
+        {
+
+            int h = pv->height[plane];
+            segment_start = ( h / pv->cpu_count ) * segment;
+            if( segment == pv->cpu_count - 1 )
+            {
+                /*
+                 * Final segment
+                 */
+                segment_stop = h;
+            } else {
+                segment_stop = ( h / pv->cpu_count ) * ( segment + 1 );
+            }
+            
+            detect_combed_segment( pv, segment_start, segment_stop );
+        }
+        /*
+         * Finished this segment, let everyone know.
+         */
+        hb_unlock( pv->decomb_complete_lock[segment] );
+    }
+    free( thread_args_v );
+}
+
+int comb_segmenter( hb_filter_private_t * pv )
+{
+    int segment;
+
+    for( segment = 0; segment < pv->cpu_count; segment++ )
+    {  
+        /*
+         * Let the thread for this segment know that we've set up work
+         * for it by releasing the begin lock (ensuring that the
+         * complete lock is already locked so that we block when
+         * we try to lock it again below).
+         */
+        hb_lock( pv->decomb_complete_lock[segment] );
+        hb_unlock( pv->decomb_begin_lock[segment] );
+    }
+
+    /*
+     * Wait until every segment thread has completed by trying to get
+     * the complete lock that we locked earlier for each thread, which
+     * will block until that thread has completed the work on that
+     * segment.
+     */
+    for( segment = 0; segment < pv->cpu_count; segment++ )
+    {
+        hb_lock( pv->decomb_complete_lock[segment] );
+        hb_unlock( pv->decomb_complete_lock[segment] );
+    }
+    
+    return check_combing_mask( pv );
+}
+
  static void yadif_filter_line( uint8_t *dst,
                                 uint8_t *prev,
                                 uint8_t *cur,
@@ -239,109 +1000,124 @@ static void yadif_filter_line( uint8_t *dst,
                                 int y,
                                 hb_filter_private_t * pv )
  {
+    /* While prev and next point to the previous and next frames,
+       prev2 and next2 will shift depending on the parity, usually 1.
+       They are the previous and next fields, the fields temporally adjacent
+       to the other field in the current frame--the one not being filtered.  */
      uint8_t *prev2 = parity ? prev : cur ;
      uint8_t *next2 = parity ? cur  : next;
-
+    
      int w = pv->width[plane];
      int refs = pv->ref_stride[plane];
      int x;
-    int macroblock_x;
-    int macroblock_y = y / 8 ;
-    
+    int eedi2_mode = ( pv->mode & MODE_EEDI2 );
      
+    /* We can replace spatial_pred with this interpolation*/
+    uint8_t * eedi2_guess = &pv->eedi_full[DST2PF][plane][y*refs];
+
+    /* Decomb's cubic interpolation can only function when there are
+       three samples above and below, so regress to yadif's traditional
+       two-tap interpolation when filtering at the top and bottom edges. */
+    int vertical_edge = 0;
+    if( ( y < 3 ) || ( y > ( pv->height[plane] - 4 ) )  )
+        vertical_edge = 1;
+
      for( x = 0; x < w; x++)
      {
-
-#if 0       
-     /* Buggy experimental code for macroblock-by-macrobock comb detection.*/
-        if(plane == 0 && pv->mode == 7)
-        {
-            if( !(x % 8))
-                macroblock_x = x / 8;        
-            
-            if(pv->cc_array[plane][macroblock_x][macroblock_y] < 0 || pv->cc_array[plane][macroblock_x][macroblock_y] > 64)
-            hb_log("[%i][%i] ( %i * %i )macroblock %i x %i is combed: %i", pv->deinterlaced_frames, plane, x, y, macroblock_x, macroblock_y, pv->cc_array[plane][macroblock_x][macroblock_y] );
-            
-            if(pv->cc_array[plane][macroblock_x][macroblock_y] == 0 && pv->cc_array[plane][macroblock_x+1][macroblock_y] == 0 && pv->cc_array[plane][macroblock_x-1][macroblock_y] == 0 && pv->cc_array[plane][macroblock_x][macroblock_y+1] == 0 && pv->cc_array[plane][macroblock_x][macroblock_y-1] == 0 )
-            {
-                dst[0] = cur[0];
-                pv->uncombed_macroblocks++;
-                goto end_of_yadif_filter_pixel;
-            }
-        }
-        pv->combed_macroblocks++;
-#endif
          /* Pixel above*/
          int c              = cur[-refs];
-        /* Temporal average -- the current pixel location in the previous and next fields */
+        /* Temporal average: the current location in the adjacent fields */
          int d              = (prev2[0] + next2[0])>>1;
          /* Pixel below */
          int e              = cur[+refs];
          
-        /* How the current pixel changes from the field before to the field after */
+        /* How the current pixel changes between the adjacent fields */
          int temporal_diff0 = ABS(prev2[0] - next2[0]);
-        /* The average of how much the pixels above and below change from the field before to now. */
+        /* The average of how much the pixels above and below change from the frame before to now. */
          int temporal_diff1 = ( ABS(prev[-refs] - cur[-refs]) + ABS(prev[+refs] - cur[+refs]) ) >> 1;
-        /* The average of how much the pixels above and below change from now to the next field. */
+        /* The average of how much the pixels above and below change from now to the next frame. */
          int temporal_diff2 = ( ABS(next[-refs] - cur[-refs]) + ABS(next[+refs] - cur[+refs]) ) >> 1;
          /* For the actual difference, use the largest of the previous average diffs. */
          int diff           = MAX3(temporal_diff0>>1, temporal_diff1, temporal_diff2);
-        
-        /* SAD of how the pixel-1, the pixel, and the pixel+1 change from the line above to below. */ 
-        int spatial_score  = ABS(cur[-refs-1] - cur[+refs-1]) + ABS(cur[-refs]-cur[+refs]) +
-                                     ABS(cur[-refs+1] - cur[+refs+1]) - 1;         
+
          int spatial_pred;
-         
-        /* Spatial pred is either a bilinear or cubic vertical interpolation. */
-        if( pv->mode >= 4  )
+        
+        if( eedi2_mode )
          {
-            spatial_pred = cubic_interpolate( cur[-3*refs], cur[-refs], cur[+refs], cur[3*refs] );
+            /* Who needs yadif's spatial predictions when we can have EEDI2's? */
+            spatial_pred = eedi2_guess[0];
+            eedi2_guess++;
          }
-        else
+        else // Yadif spatial interpolation
          {
-            spatial_pred = (c+e)>>1;
-        }
-
-/* EDDI: Edge Directed Deinterlacing Interpolation
-   Uses the Martinez-Lim Line Shift Parametric Modeling algorithm...I think.
-   Checks 4 different slopes to see if there is more similarity along a diagonal
-   than there was vertically. If a diagonal is more similar, then it indicates
-   an edge, so interpolate along that instead of a vertical line, using either
-   linear or cubic interpolation depending on mode. */
-#define YADIF_CHECK(j)\
-        {   int score = ABS(cur[-refs-1+j] - cur[+refs-1-j])\
-                      + ABS(cur[-refs  +j] - cur[+refs  -j])\
-                      + ABS(cur[-refs+1+j] - cur[+refs+1-j]);\
-            if( score < spatial_score ){\
-                spatial_score = score;\
-                if( pv->mode >= 4)\
-                {\
-                    switch(j)\
-                    {\
-                        case -1:\
-                            spatial_pred = cubic_interpolate(cur[-3 * refs - 3], cur[-refs -1], cur[+refs + 1], cur[3* refs + 3] );\
-                        break;\
-                        case -2:\
-                            spatial_pred = cubic_interpolate( ( ( cur[-3*refs - 4] + cur[-refs - 4] ) / 2 ) , cur[-refs -2], cur[+refs + 2], ( ( cur[3*refs + 4] + cur[refs + 4] ) / 2 ) );\
-                        break;\
-                        case 1:\
-                            spatial_pred = cubic_interpolate(cur[-3 * refs +3], cur[-refs +1], cur[+refs - 1], cur[3* refs -3] );\
-                        break;\
-                        case 2:\
-                            spatial_pred = cubic_interpolate(( ( cur[-3*refs + 4] + cur[-refs + 4] ) / 2 ), cur[-refs +2], cur[+refs - 2], ( ( cur[3*refs - 4] + cur[refs - 4] ) / 2 ) );\
-                        break;\
-                    }\
-                }\
-                else\
-                {\
-                    spatial_pred = ( cur[-refs +j] + cur[+refs -j] ) >>1;\
-                }\
-                
-                YADIF_CHECK(-1) YADIF_CHECK(-2) }} }}
-                YADIF_CHECK( 1) YADIF_CHECK( 2) }} }}
-                                
-        /* Temporally adjust the spatial prediction by comparing against the
-           alternate (associated) fields in the previous and next frames. */
+            /* SAD of how the pixel-1, the pixel, and the pixel+1 change from the line above to below. */ 
+            int spatial_score  = ABS(cur[-refs-1] - cur[+refs-1]) + ABS(cur[-refs]-cur[+refs]) +
+                                         ABS(cur[-refs+1] - cur[+refs+1]) - 1;         
+            
+            /* Spatial pred is either a bilinear or cubic vertical interpolation. */
+            if( ( pv->mode & MODE_CUBIC ) && !vertical_edge)
+            {
+                spatial_pred = cubic_interpolate_pixel( cur[-3*refs], cur[-refs], cur[+refs], cur[3*refs] );
+            }
+            else
+            {
+                spatial_pred = (c+e)>>1;
+            }
+
+        /* EDDI: Edge Directed Deinterlacing Interpolation
+           Checks 4 different slopes to see if there is more similarity along a diagonal
+           than there was vertically. If a diagonal is more similar, then it indicates
+           an edge, so interpolate along that instead of a vertical line, using either
+           linear or cubic interpolation depending on mode. */
+        #define YADIF_CHECK(j)\
+                {   int score = ABS(cur[-refs-1+j] - cur[+refs-1-j])\
+                              + ABS(cur[-refs  +j] - cur[+refs  -j])\
+                              + ABS(cur[-refs+1+j] - cur[+refs+1-j]);\
+                    if( score < spatial_score ){\
+                        spatial_score = score;\
+                        if( ( pv->mode & MODE_CUBIC ) && !vertical_edge )\
+                        {\
+                            switch(j)\
+                            {\
+                                case -1:\
+                                    spatial_pred = cubic_interpolate_pixel(cur[-3 * refs - 3], cur[-refs -1], cur[+refs + 1], cur[3* refs + 3] );\
+                                break;\
+                                case -2:\
+                                    spatial_pred = cubic_interpolate_pixel( ( ( cur[-3*refs - 4] + cur[-refs - 4] ) / 2 ) , cur[-refs -2], cur[+refs + 2], ( ( cur[3*refs + 4] + cur[refs + 4] ) / 2 ) );\
+                                break;\
+                                case 1:\
+                                    spatial_pred = cubic_interpolate_pixel(cur[-3 * refs +3], cur[-refs +1], cur[+refs - 1], cur[3* refs -3] );\
+                                break;\
+                                case 2:\
+                                    spatial_pred = cubic_interpolate_pixel(( ( cur[-3*refs + 4] + cur[-refs + 4] ) / 2 ), cur[-refs +2], cur[+refs - 2], ( ( cur[3*refs - 4] + cur[refs - 4] ) / 2 ) );\
+                                break;\
+                            }\
+                        }\
+                        else\
+                        {\
+                            spatial_pred = ( cur[-refs +j] + cur[+refs -j] ) >>1;\
+                        }\
+
+                        if( x >= 2 && x <= w - 3 )
+                        {
+                            YADIF_CHECK(-1)
+                            if( x >= 3 && x <= w - 4 )
+                            {
+                                YADIF_CHECK(-2) }} }}
+                            }
+                        }
+                        if( x >= 2 && x <= w - 3 )
+                        {
+                            YADIF_CHECK(1)
+                            if( x >= 3 && x <= w - 4 )
+                            {
+                                YADIF_CHECK(2) }} }}
+                            }
+                        }
+        }
+
+        /* Temporally adjust the spatial prediction by
+           comparing against lines in the adjacent fields. */
          int b = (prev2[-2*refs] + next2[-2*refs])>>1;
          int f = (prev2[+2*refs] + next2[+2*refs])>>1;
          
@@ -361,7 +1137,6 @@ static void yadif_filter_line( uint8_t *dst,
          
          dst[0] = spatial_pred;
                          
-end_of_yadif_filter_pixel:
          dst++;
          cur++;
          prev++;
@@ -371,151 +1146,321 @@ end_of_yadif_filter_pixel:
      }
  }
  
-static void yadif_filter( uint8_t ** dst,
-                          int parity,
-                          int tff,
-                          hb_filter_private_t * pv )
+/*
+ * Worker thread: filters one segment (a band of rows) of all three planes.
+ */
+void yadif_decomb_filter_thread( void *thread_args_v )
  {
+    yadif_arguments_t *yadif_work = NULL;
+    hb_filter_private_t * pv;
+    int run = 1;
+    int plane;
+    int segment, segment_start, segment_stop;
+    yadif_thread_arg_t *thread_args = thread_args_v;
+    uint8_t **dst;
+    int parity, tff, y, w, h, penultimate, ultimate, ref_stride, is_combed;
  
-#if 0
-    /* Buggy, experimental code for macroblock-by-macroblock decombing.*/
-    if( pv->mode == 7 )
+    pv = thread_args->pv;
+    segment = thread_args->segment;
+
+    hb_log("yadif thread started for segment %d", segment);
+
+    while( run )
      {
-        int x, y, block_x, block_y, plane, plane_width, plane_height, offset, cc;
-        
-        int stride = 0;
-        int block = 8;
-        int s[16];
-        int color_diff = pv->color_diff;
-        int color_equal = pv->color_equal;
-        
-        if ( pv->buf_settings->flags & 16 )
+        /*
+         * Wait here until there is work to do. hb_lock() blocks until
+         * render releases it to say that there is more work to do.
+         */
+        hb_lock( pv->yadif_begin_lock[segment] );
+
+        yadif_work = &pv->yadif_arguments[segment];
+
+        if( yadif_work->stop )
          {
-            /* Frame is progressive, be more discerning. */
-            color_diff = pv->prog_diff;
-            color_equal = pv->prog_equal;
+            /*
+             * No more work to do, exit this thread.
+             */
+            run = 0;
+            continue;
+        } 
+
+        if( yadif_work->dst == NULL )
+        {
+            hb_error( "thread started when no work available" );
+            hb_snooze(500);
+            continue;
          }
          
-        /* Iterate through planes */
-        for( plane = 0; plane < 1; plane++ )
-        {   
-            plane_width =  pv->width[plane];
-            plane_height = pv->height[plane];
-        
-            if( plane == 1 )
-            {
-                /* Y has already been checked, now offset by Y's dimensions
-                   and divide all the other values by 2, since Cr and Cb
-                   are half-size compared to Y.                               */
-                stride = plane_width * plane_height;
-            }
-            else if ( plane == 2 )
+        is_combed = pv->yadif_arguments[segment].is_combed;
+
+        /*
+         * Process all three planes, but only this segment of it.
+         */
+        for( plane = 0; plane < 3; plane++)
+        {
+
+            dst = yadif_work->dst;
+            parity = yadif_work->parity;
+            tff = yadif_work->tff;
+            w = pv->width[plane];
+            h = pv->height[plane];
+            penultimate = h - 2;
+            ultimate = h - 1;
+            ref_stride = pv->ref_stride[plane];
+            segment_start = ( h / pv->cpu_count ) * segment;
+            if( segment == pv->cpu_count - 1 )
              {
-                /* Y and Cb are done, so the offset needs to be bumped
-                   so it's width*height + (width / 2) * (height / 2)  */
-                stride *= 5/4;
+                /*
+                 * Final segment
+                 */
+                segment_stop = h;
+            } else {
+                segment_stop = ( h / pv->cpu_count ) * ( segment + 1 );
              }
-            /* Grab a horizontal line */
-            for(y = 0; y < plane_height; y += block )
-            {
-                uint8_t *line = &pv->ref[1][plane][ y*plane_width ];
  
-                /* Iterate through it horizontally in blocks */
-                for(x = 0; x < plane_width; x += block)
+            for( y = segment_start; y < segment_stop; y++ )
+            {
+                if( is_combed == 2 )
                  {
-                    /* Clear out the current macroblock mapping from the last frame. */
-                    pv->cc_array[plane][x/block][y/block] = 0;
-                    int sadA = 0;
-                    int sadB = 0;
+                    /* This line gets blend filtered, not yadif filtered. */
+                    uint8_t *cur  = &pv->ref[1][plane][y*ref_stride];
+                    uint8_t *dst2 = &dst[plane][y*w];
+                    /* These will be useful if we ever do temporal blending. */
+                    // uint8_t *prev = &pv->ref[0][plane][y*ref_stride];
+                    // uint8_t *next = &pv->ref[2][plane][y*ref_stride];
+
+                    blend_filter_line( dst2, cur, plane, y, pv );
+                }
+                else if( pv->mode == MODE_CUBIC && is_combed && ( ( y ^ parity ) & 1 ) )
+                {
+                    /* Just apply vertical cubic interpolation */
+                    uint8_t *cur  = &pv->ref[1][plane][y*ref_stride];
+                    uint8_t *dst2 = &dst[plane][y*w];
                      
-                    /* Go through the block horizontally */
-                    for(block_x = 0; block_x < block; block_x++)
+                    cubic_interpolate_line( dst2, cur, plane, y, pv );
+                }
+                else if( pv->mode & MODE_YADIF && ( ( y ^ parity ) &  1 )  && ( is_combed == 1 ) )
+                {
+                    /* This line gets yadif filtered. It is the bottom field
+                       when TFF and vice-versa. It's the field that gets
+                       filtered. Because yadif needs 2 lines above and below
+                       the one being filtered, we need to mirror the edges.
+                       When TFF, this means replacing the 2nd line with a
+                       copy of the 1st, and the last with the second-to-last. */
+                    if( y > 1 && y < ( h -2 ) )
                      {
-                        /* Go through the block vertically, collecting pixels */
-                        for(block_y = 0; block_y < block*2; block_y++)
-                        {
-                            s[block_y] = line[x+block_x+(block_y*plane_width)];
-                        }
-
-                        /* Now go through the results to check combing. */
-                        for(block_y = 0; block_y < block; block_y++)
-                        {
-                            sadA += abs(s[block_y] - s[block_y+2]);
-                            sadB += abs(s[block_y] - s[block_y+1]);
-                            
-//                            if( abs(s[block_y] - s[block_y+2]) < color_equal && abs(s[block_y] - s[block_y+1]) > color_diff)
-//                            {
-//                                pv->cc_array[plane][x/block][y/block]++;
-//                            }
-                        }
+                        /* This isn't the top or bottom, proceed as normal to yadif. */
+                        uint8_t *prev = &pv->ref[0][plane][y*ref_stride];
+                        uint8_t *cur  = &pv->ref[1][plane][y*ref_stride];
+                        uint8_t *next = &pv->ref[2][plane][y*ref_stride];
+                        uint8_t *dst2 = &dst[plane][y*w];
+
+                        yadif_filter_line( dst2, 
+                                           prev, 
+                                           cur, 
+                                           next, 
+                                           plane, 
+                                           parity ^ tff,
+                                           y, 
+                                           pv );
+                    }
+                    else if( y == 0 )
+                    {
+                        /* BFF, so y0 = y1 */
+                        memcpy( &dst[plane][y*w],
+                                &pv->ref[1][plane][1*ref_stride],
+                                w * sizeof(uint8_t) );
+                    }
+                    else if( y == 1 )
+                    {
+                        /* TFF, so y1 = y0 */
+                        memcpy( &dst[plane][y*w],
+                                &pv->ref[1][plane][0],
+                                w * sizeof(uint8_t) );
                      }
-                    
-                    if(sadA < sadB)
+                    else if( y == penultimate )
                      {
-                        pv->cc_array[plane][x/block][y/block] = 1;
+                        /* BFF, so penultimate y = ultimate y */
+                        memcpy( &dst[plane][y*w],
+                                &pv->ref[1][plane][ultimate*ref_stride],
+                                w * sizeof(uint8_t) );
                      }
-                    
+                    else if( y == ultimate )
+                    {
+                        /* TFF, so ultimate y = penultimate y */
+                        memcpy( &dst[plane][y*w],
+                                &pv->ref[1][plane][penultimate*ref_stride],
+                                w * sizeof(uint8_t) );
+                    }
+                }
+                else
+                {
+                    memcpy( &dst[plane][y*w],
+                            &pv->ref[1][plane][y*ref_stride],
+                            w * sizeof(uint8_t) );              
                  }
-            }        
+            }
          }
+        /*
+         * Finished this segment, let everyone know.
+         */
+        hb_unlock( pv->yadif_complete_lock[segment] );
      }
+    free( thread_args_v );
+}
  
-#if 0
-/* Visualize macroblocks */    
-    int x, y;
-    fprintf(stderr, "FRAME %i VISUALIZATION\n", pv->deinterlaced_frames);
-    for( y = 0; y < 60; y++ )
+static void yadif_filter( uint8_t ** dst,
+                          int parity,
+                          int tff,
+                          hb_filter_private_t * pv )
+{
+    /* If we're running comb detection, do it now, otherwise default to true. */
+    int is_combed = pv->spatial_metric >= 0 ? comb_segmenter( pv ) : 1;
+    
+    /* The comb detector suggests one of three values:
+       0: Don't filter this frame.
+       1: Deinterlace this frame.
+       2: Blend this frame.
+       Since that might conflict with the filter's mode,
+       it may be necessary to adjust this value.         */
+    if( is_combed == 1 && (pv->mode == MODE_BLEND) )
      {
-        for( x = 0; x < 90; x++ )
-        {
-            if(pv->cc_array[0][x][y])
-                fprintf(stderr, "X");
-            else
-                fprintf(stderr, "O");
-                
-        }
-        fprintf(stderr, "\n");
+        /* All combed frames are getting blended */
+        is_combed = 2;
      }
-    fprintf(stderr, "\n\n");
-#endif
-#endif
-
-    int i;
-    for( i = 0; i < 3; i++ )
+    else if( is_combed == 2 && !( pv->mode & MODE_BLEND ) )
      {
-        int w = pv->width[i];
-        int h = pv->height[i];
-        int ref_stride = pv->ref_stride[i];        
-        
-        int y;
-        for( y = 0; y < h; y++ )
+        /* Blending is disabled, so force interpolation of these frames. */
+        is_combed = 1;
+    }
+    if( is_combed == 1 &&
+        ( pv->mode & MODE_BLEND ) &&
+        !( pv->mode & ( MODE_YADIF | MODE_EEDI2 | MODE_CUBIC ) ) )
+    {
+        /* Deinterlacers are disabled, blending isn't, so blend these frames. */
+        is_combed = 2;
+    }
+    else if( is_combed &&
+             !( pv->mode & ( MODE_BLEND | MODE_YADIF | MODE_EEDI2 | MODE_CUBIC | MODE_MASK ) ) )
+    {
+        /* No deinterlacer or mask chosen, pass the frame through. */
+        is_combed = 0;
+    }
+    
+    if( is_combed == 1 )
+    {
+        pv->deinterlaced_frames++;
+    }
+    else if( is_combed == 2 )
+    {
+        pv->blended_frames++;
+    }
+    else
+    {
+        pv->unfiltered_frames++;
+    }
+    
+    if( is_combed == 1 && ( pv->mode & MODE_EEDI2 ) )
+    {
+        /* Generate an EEDI2 interpolation */
+        eedi2_planer( pv );
+    }
+    
+    if( is_combed )
+    {
+        if( ( pv->mode & MODE_EEDI2 ) && !( pv->mode & MODE_YADIF ) && is_combed == 1 )
          {
-            if( pv->mode == 3)
+            // Just pass through the EEDI2 interpolation
+            int i;
+            for( i = 0; i < 3; i++ )
              {
-                uint8_t *prev = &pv->ref[0][i][y*ref_stride];
-                uint8_t *cur  = &pv->ref[1][i][y*ref_stride];
-                uint8_t *next = &pv->ref[2][i][y*ref_stride];
-                uint8_t *dst2 = &dst[i][y*w];
+                uint8_t * ref = pv->eedi_full[DST2PF][i];
+                uint8_t * dest = dst[i];
+
+                int w = pv->width[i];
+                int ref_stride = pv->ref_stride[i];
  
-                blend_filter_line( dst2, cur, i, y, pv );
+                int y;
+                for( y = 0; y < pv->height[i]; y++ )
+                {
+                    memcpy(dest, ref, w);
+                    dest += w;
+                    ref += ref_stride;
+                }
+            }
+        }
+        else
+        {
+            int segment;
+
+            for( segment = 0; segment < pv->cpu_count; segment++ )
+            {  
+                /*
+                 * Set up the work for this segment's thread.
+                 */
+                pv->yadif_arguments[segment].parity = parity;
+                pv->yadif_arguments[segment].tff = tff;
+                pv->yadif_arguments[segment].dst = dst;
+                pv->yadif_arguments[segment].is_combed = is_combed;
+
+                /*
+                 * Let the thread for this segment know that we've set up
+                 * work for it by releasing the begin lock (ensuring that the
+                 * complete lock is already locked so that we block when
+                 * we try to lock it again below).
+                 */
+                hb_lock( pv->yadif_complete_lock[segment] );
+                hb_unlock( pv->yadif_begin_lock[segment] );
              }
-            else if( (y ^ parity) &  1 )
-            {
-                uint8_t *prev = &pv->ref[0][i][y*ref_stride];
-                uint8_t *cur  = &pv->ref[1][i][y*ref_stride];
-                uint8_t *next = &pv->ref[2][i][y*ref_stride];
-                uint8_t *dst2 = &dst[i][y*w];
  
-                yadif_filter_line( dst2, prev, cur, next, i, parity ^ tff, y, pv );
+            /*
+             * Wait until every segment thread has completed by trying to get
+             * the complete lock that we locked earlier for each thread, which
+             * will block until that thread has completed the work on its
+             * segment.
+             */
+            for( segment = 0; segment < pv->cpu_count; segment++ )
+            {
+                hb_lock( pv->yadif_complete_lock[segment] );
+                hb_unlock( pv->yadif_complete_lock[segment] );
              }
-            else
+
+            /*
+             * Entire frame is now deinterlaced.
+             */
+        }
+    }
+    else
+    {
+        /*  Just passing through... */
+        
+        /* For mcdeint's benefit... */
+        pv->yadif_arguments[0].is_combed = is_combed; // 0
+        
+        int i;
+        for( i = 0; i < 3; i++ )
+        {
+            uint8_t * ref = pv->ref[1][i];
+            uint8_t * dest = dst[i];
+            
+            int w = pv->width[i];
+            int ref_stride = pv->ref_stride[i];
+            
+            int y;
+            for( y = 0; y < pv->height[i]; y++ )
              {
-                memcpy( &dst[i][y*w],
-                        &pv->ref[1][i][y*ref_stride],
-                        w * sizeof(uint8_t) );              
+                memcpy(dest, ref, w);
+                dest += w;
+                ref += ref_stride;
              }
          }
      }
+    
+    if( pv->mode & MODE_MASK && pv->spatial_metric >= 0 )
+    {
+        if( pv->mode == MODE_MASK || is_combed )
+        apply_mask( pv );
+    }
  }
  
  static void mcdeint_filter( uint8_t ** dst,
@@ -561,7 +1506,7 @@ static void mcdeint_filter( uint8_t ** dst,
              {
                  for( x = 0; x < w; x++ )
                  {
-                    if( (x-2)+(y-1)*w >= 0 && (x+2)+(y+1)*w < w*h )
+                    if( (x-1)+(y-1)*w >= 0 && (x+1)+(y+1)*w < w*h )
                      {
                          uint8_t * filp =
                              &pv->mcdeint_frame_dec->data[i][x + y*fils];
@@ -569,11 +1514,12 @@ static void mcdeint_filter( uint8_t ** dst,
  
                          int diff0 = filp[-fils] - srcp[-srcs];
                          int diff1 = filp[+fils] - srcp[+srcs];
-
-                        int spatial_score =
-                              ABS(srcp[-srcs-1] - srcp[+srcs-1])
-                            + ABS(srcp[-srcs  ] - srcp[+srcs  ])
-                            + ABS(srcp[-srcs+1] - srcp[+srcs+1]) - 1;
+                        int spatial_score;
+                        
+                        spatial_score =
+                            ABS(srcp[-srcs-1] - srcp[+srcs-1]) +
+                            ABS(srcp[-srcs  ] - srcp[+srcs  ]) +
+                            ABS(srcp[-srcs+1] - srcp[+srcs+1]) - 1;
  
                          int temp = filp[0];
  
@@ -586,8 +1532,22 @@ static void mcdeint_filter( uint8_t ** dst,
                                  diff0 = filp[-fils+j] - srcp[-srcs+j];\
                                  diff1 = filp[+fils-j] - srcp[+srcs-j];
  
-                        MCDEINT_CHECK(-1) MCDEINT_CHECK(-2) }} }}
-                        MCDEINT_CHECK( 1) MCDEINT_CHECK( 2) }} }}
+                        if( x >= 2 && x <= w - 3 )
+                        {
+                            MCDEINT_CHECK(-1)
+                            if( x >= 3 && x <= w - 4 )
+                            {
+                                MCDEINT_CHECK(-2) }} }}
+                            }
+                        }
+                        if( x >= 2 && x <= w - 3 )
+                        {
+                            MCDEINT_CHECK(1)
+                            if( x >= 3 && x <= w - 4 )
+                            {
+                                MCDEINT_CHECK(2) }} }}
+                            }
+                        }
  
                          if(diff0 + diff1 > 0)
                          {
@@ -610,11 +1570,7 @@ static void mcdeint_filter( uint8_t ** dst,
                      }
                  }
              }
-        }
-
-        for( y = 0; y < h; y++ )
-        {
-            if( !((y ^ parity) & 1) )
+            else
              {
                  for( x = 0; x < w; x++ )
                  {
@@ -650,25 +1606,33 @@ hb_filter_private_t * hb_decomb_init( int pix_fmt,
      pv->width[1]  = pv->width[2]  = width >> 1;
      pv->height[1] = pv->height[2] = height >> 1;
  
-    int buf_size = 3 * width * height / 2;
-    pv->buf_out[0] = hb_buffer_init( buf_size );
-    pv->buf_out[1] = hb_buffer_init( buf_size );
+    pv->buf_out[0] = hb_video_buffer_init( width, height );
+    pv->buf_out[1] = hb_video_buffer_init( width, height );
      pv->buf_settings = hb_buffer_init( 0 );
  
      pv->deinterlaced_frames = 0;
-    pv->passed_frames = 0;
-    pv->color_equal = 10;
-    pv->color_diff = 15;
-    pv->threshold = 9;
-    pv->prog_equal = 10;
-    pv->prog_diff = 35;
-    pv->prog_threshold = 9;
-    
-    pv->combed_macroblocks = 0;
-    pv->uncombed_macroblocks = 0;
-    
+    pv->blended_frames = 0;
+    pv->unfiltered_frames = 0;
+
      pv->yadif_ready    = 0;
-    pv->mode     = MODE_DEFAULT;
+
+    pv->mode     = MODE_YADIF | MODE_BLEND | MODE_CUBIC;
+    pv->spatial_metric = 2;
+    pv->motion_threshold = 6;
+    pv->spatial_threshold = 9;
+    pv->block_threshold = 80;
+    pv->block_width = 16;
+    pv->block_height = 16;
+    
+    pv->magnitude_threshold = 10;
+    pv->variance_threshold = 20;
+    pv->laplacian_threshold = 20;
+    pv->dilation_threshold = 4;
+    pv->erosion_threshold = 2;
+    pv->noise_threshold = 50;
+    pv->maximum_search_distance = 24;
+    pv->post_processing = 1;
+
      pv->parity   = PARITY_DEFAULT;
  
      pv->mcdeint_mode   = MCDEINT_MODE_DEFAULT;
@@ -676,48 +1640,236 @@ hb_filter_private_t * hb_decomb_init( int pix_fmt,
  
      if( settings )
      {
-        sscanf( settings, "%d:%d:%d:%d:%d:%d:%d",
+        sscanf( settings, "%d:%d:%d:%d:%d:%d:%d:%d:%d:%d:%d:%d:%d:%d:%d:%d",
                  &pv->mode,
-                &pv->color_equal,
-                &pv->color_diff,
-                &pv->threshold,
-                &pv->prog_equal,
-                &pv->prog_diff,
-                &pv->prog_threshold );
+                &pv->spatial_metric,
+                &pv->motion_threshold,
+                &pv->spatial_threshold,
+                &pv->block_threshold,
+                &pv->block_width,
+                &pv->block_height,
+                &pv->magnitude_threshold,
+                &pv->variance_threshold,
+                &pv->laplacian_threshold,
+                &pv->dilation_threshold,
+                &pv->erosion_threshold,
+                &pv->noise_threshold,
+                &pv->maximum_search_distance,
+                &pv->post_processing,
+                &pv->parity );
      }
      
-    if( pv->mode == 2 || pv->mode == 5 )
+    pv->cpu_count = hb_get_cpu_count();
+    
+
+    if( pv->mode & MODE_MCDEINT )
      {
-        pv->mcdeint_mode = 0;
+        pv->mcdeint_mode = 2;
      }
      
      /* Allocate yadif specific buffers */
-    if( pv->mode > 0 )
+    int i, j;
+    for( i = 0; i < 3; i++ )
+    {
+        int is_chroma = !!i;
+        int w = ((width   + 31) & (~31))>>is_chroma;
+        int h = ((height+6+ 31) & (~31))>>is_chroma;
+
+        pv->ref_stride[i] = w;
+
+        for( j = 0; j < 3; j++ )
+        {
+            pv->ref[j][i] = calloc( 1, w*h*sizeof(uint8_t) ) + 3*w;
+        }
+    }
+
+    /* Allocate a buffer to store a comb mask. */
+    for( i = 0; i < 3; i++ )
+    {
+        int is_chroma = !!i;
+        int w = ((pv->width[0]   + 31) & (~31))>>is_chroma;
+        int h = ((pv->height[0]+6+ 31) & (~31))>>is_chroma;
+
+        pv->mask[i] = calloc( 1, w*h*sizeof(uint8_t) ) + 3*w;
+    }
+    
+    if( pv->mode & MODE_EEDI2 )
      {
-        int i, j;
+        /* Allocate half-height eedi2 buffers */
+        height = pv->height[0] / 2;
          for( i = 0; i < 3; i++ )
          {
              int is_chroma = !!i;
              int w = ((width   + 31) & (~31))>>is_chroma;
              int h = ((height+6+ 31) & (~31))>>is_chroma;
  
-            pv->ref_stride[i] = w;
+            for( j = 0; j < 4; j++ )
+            {
+                pv->eedi_half[j][i] = calloc( 1, w*h*sizeof(uint8_t) ) + 3*w;
+            }
+        }
+
+        /* Allocate full-height eedi2 buffers */
+        height = pv->height[0];
+        for( i = 0; i < 3; i++ )
+        {
+            int is_chroma = !!i;
+            int w = ((width   + 31) & (~31))>>is_chroma;
+            int h = ((height+6+ 31) & (~31))>>is_chroma;
  
-            for( j = 0; j < 3; j++ )
+            for( j = 0; j < 5; j++ )
              {
-                pv->ref[j][i] = malloc( w*h*sizeof(uint8_t) ) + 3*w;
+                pv->eedi_full[j][i] = calloc( 1, w*h*sizeof(uint8_t) ) + 3*w;
              }
          }
      }
+    
+     /*
+      * Create yadif threads and locks.
+      */
+     pv->yadif_threads = malloc( sizeof( hb_thread_t* ) * pv->cpu_count );
+     pv->yadif_begin_lock = malloc( sizeof( hb_lock_t * ) * pv->cpu_count );
+     pv->yadif_complete_lock = malloc( sizeof( hb_lock_t * ) * pv->cpu_count );
+     pv->yadif_arguments = malloc( sizeof( yadif_arguments_t ) * pv->cpu_count );
+
+     for( i = 0; i < pv->cpu_count; i++ )
+     {
+         yadif_thread_arg_t *thread_args;
+
+         thread_args = malloc( sizeof( yadif_thread_arg_t ) );
+
+         if( thread_args )
+         {
+             thread_args->pv = pv;
+             thread_args->segment = i;
+
+             pv->yadif_begin_lock[i] = hb_lock_init();
+             pv->yadif_complete_lock[i] = hb_lock_init();
+
+             /*
+              * Important to start off with the threads locked waiting
+              * on input.
+              */
+             hb_lock( pv->yadif_begin_lock[i] );
+
+             pv->yadif_arguments[i].stop = 0;
+             pv->yadif_arguments[i].dst = NULL;
+             
+             pv->yadif_threads[i] = hb_thread_init( "yadif_filter_segment",
+                                                    yadif_decomb_filter_thread,
+                                                    thread_args,
+                                                    HB_NORMAL_PRIORITY );
+         }
+         else
+         {
+             hb_error( "yadif could not create threads" );
+         }
+    }
+    
+    /*
+     * Create decomb threads and locks.
+     */
+    pv->decomb_threads = malloc( sizeof( hb_thread_t* ) * pv->cpu_count );
+    pv->decomb_begin_lock = malloc( sizeof( hb_lock_t * ) * pv->cpu_count );
+    pv->decomb_complete_lock = malloc( sizeof( hb_lock_t * ) * pv->cpu_count );
+    pv->decomb_arguments = malloc( sizeof( decomb_arguments_t ) * pv->cpu_count );
+    
+    for( i = 0; i < pv->cpu_count; i++ )
+    {
+        decomb_thread_arg_t *decomb_thread_args;
+    
+        decomb_thread_args = malloc( sizeof( decomb_thread_arg_t ) );
+    
+        if( decomb_thread_args )
+        {
+            decomb_thread_args->pv = pv;
+            decomb_thread_args->segment = i;
+    
+            pv->decomb_begin_lock[i] = hb_lock_init();
+            pv->decomb_complete_lock[i] = hb_lock_init();
+    
+            /*
+             * Important to start off with the threads locked waiting
+             * on input.
+             */
+            hb_lock( pv->decomb_begin_lock[i] );
+    
+            pv->decomb_arguments[i].stop = 0;
+    
+            pv->decomb_threads[i] = hb_thread_init( "decomb_filter_segment",
+                                                   decomb_filter_thread,
+                                                   decomb_thread_args,
+                                                   HB_NORMAL_PRIORITY );
+        }
+        else
+        {
+            hb_error( "decomb could not create threads" );
+        }
+    }
+    
+    if( pv->mode & MODE_EEDI2 )
+    {
+        /*
+         * Create eedi2 threads and locks.
+         */
+        pv->eedi2_threads = malloc( sizeof( hb_thread_t* ) * 3 );
+        pv->eedi2_begin_lock = malloc( sizeof( hb_lock_t * ) * 3 );
+        pv->eedi2_complete_lock = malloc( sizeof( hb_lock_t * ) * 3 );
+        pv->eedi2_arguments = malloc( sizeof( eedi2_arguments_t ) * 3 );
+
+        if( pv->post_processing > 1 )
+        {
+            pv->cx2 = (int*)eedi2_aligned_malloc(pv->height[0]*pv->ref_stride[0]*sizeof(int), 16);
+            pv->cy2 = (int*)eedi2_aligned_malloc(pv->height[0]*pv->ref_stride[0]*sizeof(int), 16);
+            pv->cxy = (int*)eedi2_aligned_malloc(pv->height[0]*pv->ref_stride[0]*sizeof(int), 16);
+            pv->tmpc = (int*)eedi2_aligned_malloc(pv->height[0]*pv->ref_stride[0]*sizeof(int), 16);
+            if( !pv->cx2 || !pv->cy2 || !pv->cxy || !pv->tmpc )
+                hb_log("EEDI2: failed to malloc derivative arrays");
+            else
+                hb_log("EEDI2: successfully mallloced derivative arrays");
+        }
+
+        for( i = 0; i < 3; i++ )
+        {
+            eedi2_thread_arg_t *eedi2_thread_args;
+
+            eedi2_thread_args = malloc( sizeof( eedi2_thread_arg_t ) );
+
+            if( eedi2_thread_args )
+            {
+                eedi2_thread_args->pv = pv;
+                eedi2_thread_args->plane = i;
+
+                pv->eedi2_begin_lock[i] = hb_lock_init();
+                pv->eedi2_complete_lock[i] = hb_lock_init();
  
+                /*
+                 * Important to start off with the threads locked waiting
+                 * on input.
+                 */
+                hb_lock( pv->eedi2_begin_lock[i] );
+
+                pv->eedi2_arguments[i].stop = 0;
+
+                pv->eedi2_threads[i] = hb_thread_init( "eedi2_filter_segment",
+                                                       eedi2_filter_thread,
+                                                       eedi2_thread_args,
+                                                       HB_NORMAL_PRIORITY );
+            }
+            else
+            {
+                hb_error( "eedi2 could not create threads" );
+            }
+        }
+    }
+    
+    
      /* Allocate mcdeint specific buffers */
      if( pv->mcdeint_mode >= 0 )
      {
          avcodec_init();
          avcodec_register_all();
-
          AVCodec * enc = avcodec_find_encoder( CODEC_ID_SNOW );
-
          int i;
          for (i = 0; i < 3; i++ )
          {
@@ -744,7 +1896,7 @@ hb_filter_private_t * hb_decomb_init( int pix_fmt,
                  case 3:
                      avctx_enc->refs = 3;
                  case 2:
-                    avctx_enc->me_method = ME_UMH;
+                    avctx_enc->me_method = ME_ITER;
                  case 1:
                      avctx_enc->flags |= CODEC_FLAG_4MV;
                      avctx_enc->dia_size =2;
@@ -752,7 +1904,7 @@ hb_filter_private_t * hb_decomb_init( int pix_fmt,
                      avctx_enc->flags |= CODEC_FLAG_QPEL;
              }
  
-            avcodec_open(avctx_enc, enc);
+            hb_avcodec_open(avctx_enc, enc);
          }
  
          pv->mcdeint_frame       = avcodec_alloc_frame();
@@ -770,14 +1922,7 @@ void hb_decomb_close( hb_filter_private_t * pv )
          return;
      }
      
-    if( pv->mode < 7 )
-    {
-        hb_log("decomb: deinterlaced %i | unfiltered %i | total %i", pv->deinterlaced_frames, pv->passed_frames, pv->deinterlaced_frames + pv->passed_frames);
-    }
-    else
-    {
-        hb_log("decomb macroblock: deinterlaced: %i | unfiltered %i | total %i", pv->combed_macroblocks, pv->uncombed_macroblocks, pv->combed_macroblocks + pv->uncombed_macroblocks);
-    }
+    hb_log("decomb: deinterlaced %i | blended %i | unfiltered %i | total %i", pv->deinterlaced_frames, pv->blended_frames, pv->unfiltered_frames, pv->deinterlaced_frames + pv->blended_frames + pv->unfiltered_frames);
  
      /* Cleanup frame buffers */
      if( pv->buf_out[0] )
@@ -794,26 +1939,140 @@ void hb_decomb_close( hb_filter_private_t * pv )
      }
  
      /* Cleanup yadif specific buffers */
-    if( pv->mode > 0 )
+    int i;
+    for( i = 0; i<3*3; i++ )
      {
-        int i;
-        for( i = 0; i<3*3; i++ )
+        uint8_t **p = &pv->ref[i%3][i/3];
+        if (*p)
+        {
+            free( *p - 3*pv->ref_stride[i/3] );
+            *p = NULL;
+        }
+    }
+    
+    /* Cleanup combing mask. */
+    for( i = 0; i<3*3; i++ )
+    {
+        uint8_t **p = &pv->mask[i/3];
+        if (*p)
+        {
+            free( *p - 3*pv->ref_stride[i/3] );
+            *p = NULL;
+        }
+    }
+    
+    if( pv->mode & MODE_EEDI2 )
+    {
+        /* Cleanup eedi-half  buffers */
+        int j;
+        for( i = 0; i<3; i++ )
+        {
+            for( j = 0; j < 4; j++ )
+            {
+                uint8_t **p = &pv->eedi_half[j][i];
+                if (*p)
+                {
+                    free( *p - 3*pv->ref_stride[i] );
+                    *p = NULL;
+                }            
+            }
+        }
+
+        /* Cleanup eedi-full  buffers */
+        for( i = 0; i<3; i++ )
          {
-            uint8_t **p = &pv->ref[i%3][i/3];
-            if (*p)
+            for( j = 0; j < 5; j++ )
              {
-                free( *p - 3*pv->ref_stride[i/3] );
-                *p = NULL;
+                uint8_t **p = &pv->eedi_full[j][i];
+                if (*p)
+                {
+                    free( *p - 3*pv->ref_stride[i] );
+                    *p = NULL;
+                }            
              }
          }
      }
+    
+    if( pv->post_processing > 1  && ( pv->mode & MODE_EEDI2 ) )
+    {
+        if (pv->cx2) eedi2_aligned_free(pv->cx2);
+        if (pv->cy2) eedi2_aligned_free(pv->cy2);
+        if (pv->cxy) eedi2_aligned_free(pv->cxy);
+        if (pv->tmpc) eedi2_aligned_free(pv->tmpc);
+    }
+    
+    for( i = 0; i < pv->cpu_count; i++)
+    {
+        /*
+         * Tell each yadif thread to stop, and then clean up.
+         */
+        pv->yadif_arguments[i].stop = 1;
+        hb_unlock(  pv->yadif_begin_lock[i] );
+
+        hb_thread_close( &pv->yadif_threads[i] );
+        hb_lock_close( &pv->yadif_begin_lock[i] );
+        hb_lock_close( &pv->yadif_complete_lock[i] );
+    }
+    
+    /*
+     * free memory for yadif structs
+     */
+    free( pv->yadif_threads );
+    free( pv->yadif_begin_lock );
+    free( pv->yadif_complete_lock );
+    free( pv->yadif_arguments );
+    
+    for( i = 0; i < pv->cpu_count; i++)
+    {
+        /*
+         * Tell each decomb thread to stop, and then clean up.
+         */
+        pv->decomb_arguments[i].stop = 1;
+        hb_unlock(  pv->decomb_begin_lock[i] );
+
+        hb_thread_close( &pv->decomb_threads[i] );
+        hb_lock_close( &pv->decomb_begin_lock[i] );
+        hb_lock_close( &pv->decomb_complete_lock[i] );
+    }
+    
+    /*
+     * free memory for decomb structs
+     */
+    free( pv->decomb_threads );
+    free( pv->decomb_begin_lock );
+    free( pv->decomb_complete_lock );
+    free( pv->decomb_arguments );
+    
+    if( pv->mode & MODE_EEDI2 )
+    {
+        for( i = 0; i < 3; i++)
+        {
+            /*
+             * Tell each eedi2 thread to stop, and then clean up.
+             */
+            pv->eedi2_arguments[i].stop = 1;
+            hb_unlock(  pv->eedi2_begin_lock[i] );
+
+            hb_thread_close( &pv->eedi2_threads[i] );
+            hb_lock_close( &pv->eedi2_begin_lock[i] );
+            hb_lock_close( &pv->eedi2_complete_lock[i] );
+        }
  
+        /*
+         * free memory for eedi2 structs
+         */
+        free( pv->eedi2_threads );
+        free( pv->eedi2_begin_lock );
+        free( pv->eedi2_complete_lock );
+        free( pv->eedi2_arguments );
+    }
+    
      /* Cleanup mcdeint specific buffers */
      if( pv->mcdeint_mode >= 0 )
      {
          if( pv->mcdeint_avctx_enc )
          {
-            avcodec_close( pv->mcdeint_avctx_enc );
+            hb_avcodec_close( pv->mcdeint_avctx_enc );
              av_freep( &pv->mcdeint_avctx_enc );
          }
          if( pv->mcdeint_outbuf )
@@ -825,13 +2084,15 @@ void hb_decomb_close( hb_filter_private_t * pv )
      free( pv );
  }
  
-int hb_decomb_work( hb_buffer_t * buf_in,
-                         hb_buffer_t ** buf_out,
-                         int pix_fmt,
-                         int width,
-                         int height,
-                         hb_filter_private_t * pv )
+int hb_decomb_work( const hb_buffer_t * cbuf_in,
+                    hb_buffer_t ** buf_out,
+                    int pix_fmt,
+                    int width,
+                    int height,
+                    hb_filter_private_t * pv )
  {
+    hb_buffer_t * buf_in = (hb_buffer_t *)cbuf_in;
+
      if( !pv ||
          pix_fmt != pv->pix_fmt ||
          width   != pv->width[0] ||
@@ -843,40 +2104,6 @@ int hb_decomb_work( hb_buffer_t * buf_in,
      avpicture_fill( &pv->pic_in, buf_in->data,
                      pix_fmt, width, height );
  
-    /* Use libavcodec deinterlace if mode == 0 */
-    if( pv->mode == 0 )
-    {
-        avpicture_fill( &pv->pic_out, pv->buf_out[0]->data,
-                        pix_fmt, width, height );
-
-        /* Check for combing on the input frame */
-        int interlaced =  hb_detect_comb(buf_in, width, height, pv->color_equal, pv->color_diff, pv->threshold, pv->prog_equal, pv->prog_diff, pv->prog_threshold);
-        
-        if(interlaced)
-        {
-            avpicture_deinterlace( &pv->pic_out, &pv->pic_in,
-                                   pix_fmt, width, height );
-
-            pv->deinterlaced_frames++;
-            //hb_log("Frame %i is combed (Progressive: %s )", pv->deinterlaced_frames + pv->passed_frames, (buf_in->flags & 16) ? "Y" : "N");
-            
-            hb_buffer_copy_settings( pv->buf_out[0], buf_in );
-            *buf_out = pv->buf_out[0];            
-        }
-        else
-        {
-            /* No combing detected, pass input frame through unmolested.*/
-            
-            pv->passed_frames++;
-            
-            hb_buffer_copy_settings( pv->buf_out[0], buf_in );
-            *buf_out = buf_in;
-            
-        }
-
-        return FILTER_OK;
-    }
-    
      /* Determine if top-field first layout */
      int tff;
      if( pv->parity < 0 )
@@ -890,12 +2117,6 @@ int hb_decomb_work( hb_buffer_t * buf_in,
  
      /* Store current frame in yadif cache */
      store_ref( (const uint8_t**)pv->pic_in.data, pv );
-    
-    if( pv->mode < 7 )
-    {
-        /* Note down if the input frame is combed */
-        pv->comb = (pv->comb << 1) | hb_detect_comb(buf_in, width, height, pv->color_equal, pv->color_diff, pv->threshold, pv->prog_equal, pv->prog_diff, pv->prog_threshold);
-    }
  
      /* If yadif is not ready, store another ref and return FILTER_DELAY */
      if( pv->yadif_ready == 0 )
@@ -912,71 +2133,59 @@ int hb_decomb_work( hb_buffer_t * buf_in,
          return FILTER_DELAY;
      }
  
-    /* yadif works one frame behind so if the previous frame
-     * had combing, deinterlace it otherwise just output it. */
-    if( pv->mode == 7 ) // Experimental for macroblock decombing
+    /* Perform yadif filtering */        
+    int frame;
+    for( frame = 0; frame <= ( ( pv->mode & MODE_MCDEINT ) ? 1 : 0 ) ; frame++ )
+// This would be what to use for bobbing: for( frame = 0; frame <= 0 ; frame++ )
      {
-        /* Perform yadif filtering */
-        
-        pv->deinterlaced_frames++;
-        int frame;
-        for( frame = 0; frame <= ( ( pv->mode == 2 || pv->mode == 5 )? 1 : 0 ) ; frame++ )
-        {
-            int parity = frame ^ tff ^ 1;
-
-            avpicture_fill( &pv->pic_out, pv->buf_out[!(frame^1)]->data,
-                            pix_fmt, width, height );
-
-            yadif_filter( pv->pic_out.data, parity, tff, pv );
  
-            if( pv->mcdeint_mode >= 0 )
-            {
-                /* Perform mcdeint filtering */
-                avpicture_fill( &pv->pic_in,  pv->buf_out[(frame^1)]->data,
-                                pix_fmt, width, height );
-
-                mcdeint_filter( pv->pic_in.data, pv->pic_out.data, parity, pv );
-            }
+#if 0        
+        /* Perhaps skip the second run if the frame is uncombed? */
+        if( frame && !pv->yadif_arguments[0].is_combed )
+        {
+            break;
+        }
+#endif        
+        int parity = frame ^ tff ^ 1;
  
-            *buf_out = pv->buf_out[!(frame^1)];
+// This will be for bobbing
+#if 0
+        if( pv->alternator )
+        {
+            parity = !parity;
+            pv->alternator = 0;
          }
-    }
-    else if(  (pv->comb & 2 ) == 0 )
-    {
-        /* previous frame not interlaced - copy cached input frame to buf_out */
-        
-        pv->passed_frames++;
-        
-        avpicture_fill( &pv->pic_out,  pv->buf_out[0]->data, pix_fmt, width, height );
-        get_ref( (uint8_t**)pv->pic_out.data, pv, 1 );
-        *buf_out = pv->buf_out[0];
-    }
-    else
-    {
-        /* Perform yadif filtering */
-        
-        pv->deinterlaced_frames++;
-        int frame;
-        for( frame = 0; frame <= ( ( pv->mode == 2 || pv->mode == 5 )? 1 : 0 ) ; frame++ )
+        else
          {
-            int parity = frame ^ tff ^ 1;
-
-            avpicture_fill( &pv->pic_out, pv->buf_out[!(frame^1)]->data,
-                            pix_fmt, width, height );
-
-            yadif_filter( pv->pic_out.data, parity, tff, pv );
+            pv->alternator = 1;
+        }
+#endif
+        pv->tff = !parity;
  
-            if( pv->mcdeint_mode >= 0 )
-            {
-                /* Perform mcdeint filtering */
-                avpicture_fill( &pv->pic_in,  pv->buf_out[(frame^1)]->data,
-                                pix_fmt, width, height );
+        avpicture_fill( &pv->pic_out, pv->buf_out[!(frame^1)]->data,
+                        pix_fmt, width, height );
  
-                mcdeint_filter( pv->pic_in.data, pv->pic_out.data, parity, pv );
-            }
+        /* XXX
+           Should check here and only bother filtering the second
+           field when the first field was detected as combed.
+           When it is not combed, the frame is progressive,
+           so mcdeint should be skipped as well...
+        */
+        yadif_filter( pv->pic_out.data, parity, tff, pv );
+
+        /* Commented out code in the line below would skip mcdeint
+           on uncombed frames. Possibly a bad idea, since mcdeint
+           maintains the same snow context for the entire video... */
+        if( pv->mcdeint_mode >= 0 /* && pv->yadif_arguments[0].is_combed */)
+        {
+            /* Perform mcdeint filtering */
+            avpicture_fill( &pv->pic_in,  pv->buf_out[(frame^1)]->data,
+                            pix_fmt, width, height );
  
-            *buf_out = pv->buf_out[!(frame^1)];
+            mcdeint_filter( pv->pic_in.data, pv->pic_out.data, parity, pv );
          }
+
+        *buf_out = pv->buf_out[!(frame^1)];
      }
  
      /* Copy buffered settings to output buffer settings */