From 4558b59e9578584e58fca98f090ef41ca0573327 Mon Sep 17 00:00:00 2001
From: ConfusedPolarBear <33811686+ConfusedPolarBear@users.noreply.github.com>
Date: Thu, 25 Aug 2022 01:28:41 -0500
Subject: [PATCH] Only use inverted indexes to search
---
.../Data/AnalysisStatistics.cs | 19 +-
.../ScheduledTasks/AnalyzeEpisodesTask.cs | 233 +-----------------
2 files changed, 4 insertions(+), 248 deletions(-)
diff --git a/ConfusedPolarBear.Plugin.IntroSkipper/Data/AnalysisStatistics.cs b/ConfusedPolarBear.Plugin.IntroSkipper/Data/AnalysisStatistics.cs
index d0f1e97..522dbdf 100644
--- a/ConfusedPolarBear.Plugin.IntroSkipper/Data/AnalysisStatistics.cs
+++ b/ConfusedPolarBear.Plugin.IntroSkipper/Data/AnalysisStatistics.cs
@@ -25,30 +25,15 @@ public class AnalysisStatistics
///
public ThreadSafeInteger IndexSearches { get; } = new ThreadSafeInteger();
- ///
- /// Gets the number of times a quick scan successfully located a pair of introductions.
- ///
- public ThreadSafeInteger QuickScans { get; } = new ThreadSafeInteger();
-
- ///
- /// Gets the number of times a full scan successfully located a pair of introductions.
- ///
- public ThreadSafeInteger FullScans { get; } = new ThreadSafeInteger();
-
///
/// Gets the total CPU time spent waiting for audio fingerprints to be generated.
///
public ThreadSafeInteger FingerprintCPUTime { get; } = new ThreadSafeInteger();
///
- /// Gets the total CPU time spent analyzing fingerprints in the initial pass.
+ /// Gets the total CPU time spent analyzing fingerprints.
///
- public ThreadSafeInteger FirstPassCPUTime { get; } = new ThreadSafeInteger();
-
- ///
- /// Gets the total CPU time spent analyzing fingerprints in the second pass.
- ///
- public ThreadSafeInteger SecondPassCPUTime { get; } = new ThreadSafeInteger();
+ public ThreadSafeInteger AnalysisCPUTime { get; } = new ThreadSafeInteger();
///
/// Gets the total task runtime across all threads.
diff --git a/ConfusedPolarBear.Plugin.IntroSkipper/ScheduledTasks/AnalyzeEpisodesTask.cs b/ConfusedPolarBear.Plugin.IntroSkipper/ScheduledTasks/AnalyzeEpisodesTask.cs
index 47033e4..0d4507f 100644
--- a/ConfusedPolarBear.Plugin.IntroSkipper/ScheduledTasks/AnalyzeEpisodesTask.cs
+++ b/ConfusedPolarBear.Plugin.IntroSkipper/ScheduledTasks/AnalyzeEpisodesTask.cs
@@ -30,16 +30,6 @@ public class AnalyzeEpisodesTask : IScheduledTask
///
private const double SamplesToSeconds = 0.128;
- ///
- /// Bucket size used in the reanalysis histogram.
- ///
- private const int ReanalysisBucketWidth = 5;
-
- ///
- /// Maximum time (in seconds) that an intro's duration can be different from a typical intro's duration before marking it for reanalysis.
- ///
- private const double ReanalysisTolerance = ReanalysisBucketWidth * 1.5;
-
private readonly ILogger _logger;
private readonly ILogger _queueLogger;
@@ -375,17 +365,6 @@ public class AnalyzeEpisodesTask : IScheduledTask
}
}
- // Only run the second pass if the user hasn't requested cancellation and we found an intro
- if (!cancellationToken.IsCancellationRequested && everFoundIntro)
- {
- var start = DateTime.Now;
-
- // Run a second pass over this season to remove outliers and fix episodes that failed in the first pass.
- RunSecondPass(season.Value);
-
- analysisStatistics.SecondPassCPUTime.AddDuration(start);
- }
-
lock (_introsLock)
{
Plugin.Instance!.SaveTimestamps();
@@ -441,7 +420,6 @@ public class AnalyzeEpisodesTask : IScheduledTask
// If this isn't running as part of the first analysis pass, don't count this CPU time as first pass time.
var start = isFirstPass ? DateTime.Now : DateTime.MinValue;
- // ===== Method 1: Inverted indexes =====
// Creates an inverted fingerprint point index for both episodes.
// For every point which is a 100% match, search for an introduction at that point.
var (lhsRanges, rhsRanges) = SearchInvertedIndex(lhsPoints, rhsPoints);
@@ -450,46 +428,17 @@ public class AnalyzeEpisodesTask : IScheduledTask
{
_logger.LogTrace("Index search successful");
analysisStatistics.IndexSearches.Increment();
- analysisStatistics.FirstPassCPUTime.AddDuration(start);
+ analysisStatistics.AnalysisCPUTime.AddDuration(start);
return GetLongestTimeRange(lhsId, lhsRanges, rhsId, rhsRanges);
}
- // ===== Method 2: Quick scan =====
- // Tests if an intro can be found within the first 5 seconds of the episodes. ±5/0.128 = ±40 samples.
- (lhsRanges, rhsRanges) = ShiftEpisodes(lhsPoints, rhsPoints, -40, 40);
-
- if (lhsRanges.Count > 0)
- {
- _logger.LogTrace("Quick scan successful");
- analysisStatistics.QuickScans.Increment();
- analysisStatistics.FirstPassCPUTime.AddDuration(start);
-
- return GetLongestTimeRange(lhsId, lhsRanges, rhsId, rhsRanges);
- }
-
- // ===== Method 3: Full scan =====
- // Compares all elements of the shortest fingerprint to the other fingerprint.
- var limit = Math.Min(lhsPoints.Length, rhsPoints.Length);
- (lhsRanges, rhsRanges) = ShiftEpisodes(lhsPoints, rhsPoints, -1 * limit, limit);
-
- if (lhsRanges.Count > 0)
- {
- _logger.LogTrace("Full scan successful");
- analysisStatistics.FullScans.Increment();
- analysisStatistics.FirstPassCPUTime.AddDuration(start);
-
- return GetLongestTimeRange(lhsId, lhsRanges, rhsId, rhsRanges);
- }
-
- // No method was able to find an introduction, return nothing.
-
_logger.LogTrace(
"Unable to find a shared introduction sequence between {LHS} and {RHS}",
lhsId,
rhsId);
- analysisStatistics.FirstPassCPUTime.AddDuration(start);
+ analysisStatistics.AnalysisCPUTime.AddDuration(start);
return (new Intro(lhsId), new Intro(rhsId));
}
@@ -694,184 +643,6 @@ public class AnalyzeEpisodesTask : IScheduledTask
return BitOperations.PopCount(number);
}
- ///
- /// Reanalyze the most recently analyzed season.
- /// Looks for and fixes intro durations that were either not found or are statistical outliers.
- ///
- /// List of episodes that was just analyzed.
- private void RunSecondPass(List episodes)
- {
- // First, assert that at least half of the episodes in this season have an intro.
- var validCount = 0;
- var totalCount = episodes.Count;
-
- foreach (var episode in episodes)
- {
- if (Plugin.Instance!.Intros.TryGetValue(episode.EpisodeId, out var intro) && intro.Valid)
- {
- validCount++;
- }
- }
-
- var percentValid = (validCount * 100) / totalCount;
- _logger.LogTrace("Found intros in {Valid}/{Total} ({Percent}%) of episodes", validCount, totalCount, percentValid);
- if (percentValid < 50)
- {
- return;
- }
-
- // Create a histogram of all episode durations
- var histogram = new Dictionary();
- foreach (var episode in episodes)
- {
- var id = episode.EpisodeId;
- var duration = GetIntroDuration(id);
-
- if (duration < minimumIntroDuration)
- {
- continue;
- }
-
- // Bucket the duration into equally sized groups
- var bucket = Convert.ToInt32(Math.Floor(duration / ReanalysisBucketWidth)) * ReanalysisBucketWidth;
-
- // TryAdd returns true when the key was successfully added (i.e. for newly created buckets).
- // Newly created buckets are initialized with the provided episode ID, so nothing else needs to be done for them.
- if (histogram.TryAdd(bucket, new SeasonHistogram(id)))
- {
- continue;
- }
-
- histogram[bucket].Episodes.Add(id);
- }
-
- // Find the bucket that was seen most often, as this is likely to be the true intro length.
- var maxDuration = 0;
- var maxBucket = new SeasonHistogram(Guid.Empty);
- foreach (var entry in histogram)
- {
- if (entry.Value.Count > maxBucket.Count)
- {
- maxDuration = entry.Key;
- maxBucket = entry.Value;
- }
- }
-
- // Ensure that the most frequently seen bucket has a majority
- percentValid = (maxBucket.Count * 100) / validCount;
- _logger.LogTrace(
- "Intro duration {Duration} appeared {Frequency} times ({Percent}%)",
- maxDuration,
- maxBucket.Count,
- percentValid);
-
- if (percentValid < 50 || maxBucket.Episodes[0].Equals(Guid.Empty))
- {
- return;
- }
-
- _logger.LogTrace("Second pass is processing {Count} episodes", totalCount - maxBucket.Count);
-
- // Calculate a range of intro durations that are most likely to be correct.
- var maxEpisode = episodes.Find(x => x.EpisodeId == maxBucket.Episodes[0]);
- if (maxEpisode is null)
- {
- _logger.LogError("Second pass failed to get episode from bucket");
- return;
- }
-
- var lhsDuration = GetIntroDuration(maxEpisode.EpisodeId);
- var (lowTargetDuration, highTargetDuration) = (
- lhsDuration - ReanalysisTolerance,
- lhsDuration + ReanalysisTolerance);
-
- // TODO: add limit and make it customizable
- var count = maxBucket.Episodes.Count - 1;
- var goodFingerprints = new List();
- foreach (var id in maxBucket.Episodes)
- {
- if (!_fingerprintCache.TryGetValue(id, out var fp))
- {
- _logger.LogTrace("Second pass: max bucket episode {Id} not in cache, skipping", id);
- continue;
- }
-
- goodFingerprints.Add(fp);
- }
-
- foreach (var episode in episodes)
- {
- // Don't reanalyze episodes from the max bucket
- if (maxBucket.Episodes.Contains(episode.EpisodeId))
- {
- continue;
- }
-
- var oldDuration = GetIntroDuration(episode.EpisodeId);
-
- // If the episode's intro duration is close enough to the targeted bucket, leave it alone.
- if (Math.Abs(lhsDuration - oldDuration) <= ReanalysisTolerance)
- {
- _logger.LogTrace(
- "Not reanalyzing episode {Path} (intro is {Initial}, target is {Max})",
- episode.Path,
- Math.Round(oldDuration, 2),
- maxDuration);
-
- continue;
- }
-
- _logger.LogTrace(
- "Reanalyzing episode {Path} (intro is {Initial}, target is {Max})",
- episode.Path,
- Math.Round(oldDuration, 2),
- maxDuration);
-
- // Analyze the episode again, ignoring whatever is returned for the known good episode.
- foreach (var lhsFingerprint in goodFingerprints)
- {
- if (!_fingerprintCache.TryGetValue(episode.EpisodeId, out var fp))
- {
- _logger.LogTrace("Unable to get cached fingerprint for {Id}, skipping", episode.EpisodeId);
- continue;
- }
-
- var (_, newRhs) = FingerprintEpisodes(
- maxEpisode.EpisodeId,
- lhsFingerprint,
- episode.EpisodeId,
- fp,
- false);
-
- // Ensure that the new intro duration is within the targeted bucket and longer than what was found previously.
- var newDuration = Math.Round(newRhs.IntroEnd - newRhs.IntroStart, 2);
- if (newDuration < oldDuration || newDuration < lowTargetDuration || newDuration > highTargetDuration)
- {
- _logger.LogTrace(
- "Ignoring reanalysis for {Path} (was {Initial}, now is {New})",
- episode.Path,
- oldDuration,
- newDuration);
-
- continue;
- }
-
- _logger.LogTrace(
- "Reanalysis succeeded for {Path} (was {Initial}, now is {New})",
- episode.Path,
- oldDuration,
- newDuration);
-
- lock (_introsLock)
- {
- Plugin.Instance!.Intros[episode.EpisodeId] = newRhs;
- }
-
- break;
- }
- }
- }
-
private double GetIntroDuration(Guid id)
{
if (!Plugin.Instance!.Intros.TryGetValue(id, out var episode))