From 3370f1058dd8e744fb24598a52f6e33706e4f0b4 Mon Sep 17 00:00:00 2001 From: ConfusedPolarBear <33811686+ConfusedPolarBear@users.noreply.github.com> Date: Fri, 13 May 2022 01:13:13 -0500 Subject: [PATCH] Add second analysis pass --- .../TestAudioFingerprinting.cs | 18 +- .../Data/Intro.cs | 24 +- .../Data/SeasonHistogram.cs | 31 ++ .../ScheduledTasks/FingerprinterTask.cs | 277 +++++++++++++++--- 4 files changed, 294 insertions(+), 56 deletions(-) create mode 100644 ConfusedPolarBear.Plugin.IntroSkipper/Data/SeasonHistogram.cs diff --git a/ConfusedPolarBear.Plugin.IntroSkipper.Tests/TestAudioFingerprinting.cs b/ConfusedPolarBear.Plugin.IntroSkipper.Tests/TestAudioFingerprinting.cs index 63bca3e..eecf7ec 100644 --- a/ConfusedPolarBear.Plugin.IntroSkipper.Tests/TestAudioFingerprinting.cs +++ b/ConfusedPolarBear.Plugin.IntroSkipper.Tests/TestAudioFingerprinting.cs @@ -63,16 +63,18 @@ public class TestFPCalc var logger = new Logger(new LoggerFactory()); var task = new FingerprinterTask(logger); - var lhs = queueEpisode("audio/big_buck_bunny_intro.mp3"); - var rhs = queueEpisode("audio/big_buck_bunny_clip.mp3"); + var lhsEpisode = queueEpisode("audio/big_buck_bunny_intro.mp3"); + var rhsEpisode = queueEpisode("audio/big_buck_bunny_clip.mp3"); - var result = task.FingerprintEpisodes(lhs, rhs); - var actual = FingerprinterTask.LastIntro; + var (lhs, rhs) = task.FingerprintEpisodes(lhsEpisode, rhsEpisode); - Assert.True(result); - Assert.True(actual.Valid); - Assert.Equal(5.12, actual.IntroStart); - Assert.Equal(22.912, actual.IntroEnd); + Assert.True(lhs.Valid); + Assert.Equal(0, lhs.IntroStart); + Assert.Equal(17.792, lhs.IntroEnd); + + Assert.True(rhs.Valid); + Assert.Equal(5.12, rhs.IntroStart); + Assert.Equal(22.912, rhs.IntroEnd); } private QueuedEpisode queueEpisode(string path) diff --git a/ConfusedPolarBear.Plugin.IntroSkipper/Data/Intro.cs b/ConfusedPolarBear.Plugin.IntroSkipper/Data/Intro.cs index a806b17..714cfba 100644 --- a/ConfusedPolarBear.Plugin.IntroSkipper/Data/Intro.cs +++ b/ConfusedPolarBear.Plugin.IntroSkipper/Data/Intro.cs @@ -8,16 +8,36 @@ namespace ConfusedPolarBear.Plugin.IntroSkipper; /// public class Intro { + /// + /// Initializes a new instance of the class. + /// + /// Episode. + /// Intro start time. + /// Intro end time. + public Intro(Guid episode, double start, double end) + { + EpisodeId = episode; + IntroStart = start; + IntroEnd = end; + } + + /// + /// Initializes a new instance of the class. + /// + public Intro() + { + } + /// /// Gets or sets the Episode ID. /// public Guid EpisodeId { get; set; } /// - /// Gets or sets a value indicating whether this introduction is valid or not. + /// Gets a value indicating whether this introduction is valid or not. /// Invalid results must not be returned through the API. /// - public bool Valid { get; set; } + public bool Valid => IntroEnd > 0; /// /// Gets or sets the introduction sequence start time. diff --git a/ConfusedPolarBear.Plugin.IntroSkipper/Data/SeasonHistogram.cs b/ConfusedPolarBear.Plugin.IntroSkipper/Data/SeasonHistogram.cs new file mode 100644 index 0000000..832d62a --- /dev/null +++ b/ConfusedPolarBear.Plugin.IntroSkipper/Data/SeasonHistogram.cs @@ -0,0 +1,31 @@ +#pragma warning disable CA1815 + +using System; +using System.Collections.ObjectModel; + +namespace ConfusedPolarBear.Plugin.IntroSkipper; + +/// +/// Histogram entry for episodes in a season. +/// +public struct SeasonHistogram +{ + /// + /// Initializes a new instance of the struct. + /// + /// First episode seen with this duration. + public SeasonHistogram(Guid firstEpisode) + { + Episodes.Add(firstEpisode); + } + + /// + /// Gets episodes with this duration. + /// + public Collection Episodes { get; } = new Collection(); + + /// + /// Gets the number of times an episode with an intro of this duration has been seen. + /// + public int Count => Episodes?.Count ?? 0; +} diff --git a/ConfusedPolarBear.Plugin.IntroSkipper/ScheduledTasks/FingerprinterTask.cs b/ConfusedPolarBear.Plugin.IntroSkipper/ScheduledTasks/FingerprinterTask.cs index e49d5a7..abcbcad 100644 --- a/ConfusedPolarBear.Plugin.IntroSkipper/ScheduledTasks/FingerprinterTask.cs +++ b/ConfusedPolarBear.Plugin.IntroSkipper/ScheduledTasks/FingerprinterTask.cs @@ -33,6 +33,16 @@ public class FingerprinterTask : IScheduledTask /// private const double SamplesToSeconds = 0.128; + /// + /// Bucket size used in the reanalysis histogram. + /// + private const int ReanalysisBucketWidth = 5; + + /// + /// Maximum time (in seconds) that an intro's duration can be different from a typical intro's duration before marking it for reanalysis. + /// + private const double ReanalysisTolerance = ReanalysisBucketWidth * 1.5; + private readonly ILogger _logger; /// @@ -44,11 +54,6 @@ public class FingerprinterTask : IScheduledTask _logger = logger; } - /// - /// Gets the last detected intro sequence. Only populated when a unit test is running. - /// - public static Intro LastIntro { get; private set; } = new Intro(); - /// /// Gets the task name. /// @@ -149,14 +154,18 @@ public class FingerprinterTask : IScheduledTask { _logger.LogDebug("Analyzing {LHS} and {RHS}", lhs.Path, rhs.Path); - if (FingerprintEpisodes(lhs, rhs)) - { - everFoundIntro = true; - } - else + var (lhsIntro, rhsIntro) = FingerprintEpisodes(lhs, rhs); + + Plugin.Instance.Intros![lhsIntro.EpisodeId] = lhsIntro; + Plugin.Instance.Intros![rhsIntro.EpisodeId] = rhsIntro; + + if (!lhsIntro.Valid) { failures += 2; + continue; } + + everFoundIntro = true; } catch (FingerprintException ex) { @@ -175,6 +184,14 @@ public class FingerprinterTask : IScheduledTask { break; } + + if (!everFoundIntro) + { + continue; + } + + // Reanalyze this season to check for (and hopefully correct) outliers and failed episodes. + CheckSeason(season.Value); } return Task.CompletedTask; @@ -185,12 +202,33 @@ public class FingerprinterTask : IScheduledTask /// /// First episode to analyze. /// Second episode to analyze. - /// true if an intro was found in both episodes, otherwise false. - public bool FingerprintEpisodes(QueuedEpisode lhsEpisode, QueuedEpisode rhsEpisode) + /// Intros for the first and second episodes. + public (Intro Lhs, Intro Rhs) FingerprintEpisodes(QueuedEpisode lhsEpisode, QueuedEpisode rhsEpisode) { - var lhs = FPCalc.Fingerprint(lhsEpisode); - var rhs = FPCalc.Fingerprint(rhsEpisode); + var lhsFingerprint = FPCalc.Fingerprint(lhsEpisode); + var rhsFingerprint = FPCalc.Fingerprint(rhsEpisode); + return FingerprintEpisodes( + lhsEpisode.EpisodeId, + lhsFingerprint, + rhsEpisode.EpisodeId, + rhsFingerprint); + } + + /// + /// Analyze two episodes to find an introduction sequence shared between them. + /// + /// First episode id. + /// First episode to analyze. + /// Second episode id. + /// Second episode to analyze. + /// Intros for the first and second episodes. + public (Intro Lhs, Intro Rhs) FingerprintEpisodes( + Guid lhsId, + ReadOnlyCollection lhs, + Guid rhsId, + ReadOnlyCollection rhs) + { var lhsRanges = new List(); var rhsRanges = new List(); @@ -219,16 +257,11 @@ public class FingerprinterTask : IScheduledTask if (lhsRanges.Count == 0) { _logger.LogDebug( - "Unable to find a shared introduction sequence between {LHS} and {RHS}", - lhsEpisode.Path, - rhsEpisode.Path); + "Unable to find a shared introduction sequence {LHS} and {RHS}", + lhsId, + rhsId); - // TODO: if an episode fails but others in the season succeed, reanalyze it against two that succeeded. - - StoreIntro(lhsEpisode.EpisodeId, 0, 0); - StoreIntro(rhsEpisode.EpisodeId, 0, 0); - - return false; + return (new Intro(lhsId, 0, 0), new Intro(rhsId, 0, 0)); } // After comparing both episodes at all possible shift positions, store the longest time range as the intro. @@ -249,10 +282,7 @@ public class FingerprinterTask : IScheduledTask rhsIntro.Start = 0; } - StoreIntro(lhsEpisode.EpisodeId, lhsIntro.Start, lhsIntro.End); - StoreIntro(rhsEpisode.EpisodeId, rhsIntro.Start, rhsIntro.End); - - return true; + return (new Intro(lhsId, lhsIntro.Start, lhsIntro.End), new Intro(rhsId, rhsIntro.Start, rhsIntro.End)); } /// @@ -366,25 +396,6 @@ public class FingerprinterTask : IScheduledTask return (lContiguous, rContiguous); } - private static void StoreIntro(Guid episode, double introStart, double introEnd) - { - var intro = new Intro() - { - EpisodeId = episode, - Valid = introEnd > 0, // don't test introStart here as the intro could legitimately happen at the start. - IntroStart = introStart, - IntroEnd = introEnd - }; - - if (Plugin.Instance is null) - { - LastIntro = intro; - return; - } - - Plugin.Instance.Intros[episode] = intro; - } - /// /// Count the number of bits that are set in the provided number. /// @@ -406,6 +417,180 @@ public class FingerprinterTask : IScheduledTask return count; } + /// + /// Reanalyze the most recently analyzed season. + /// Looks for and fixes intro durations that were either not found or are statistical outliers. + /// + /// List of episodes that was just analyzed. + private void CheckSeason(List episodes) + { + var intros = Plugin.Instance!.Intros; + + // First, assert that at least half of the episodes in this season have an intro. + var validCount = 0; + var totalCount = episodes.Count; + + foreach (var episode in episodes) + { + if (intros[episode.EpisodeId].Valid) + { + validCount++; + } + } + + var percentValid = (validCount * 100) / totalCount; + _logger.LogDebug("Found intros in {Valid}/{Total} ({Percent}%) of episodes", validCount, totalCount, percentValid); + if (percentValid < 50) + { + return; + } + + // Create a histogram of all episode durations + var histogram = new Dictionary(); + foreach (var episode in episodes) + { + var id = episode.EpisodeId; + var duration = GetIntroDuration(id); + + if (duration < MinimumIntroDuration) + { + continue; + } + + // Bucket the duration into equally sized groups + var bucket = Convert.ToInt32(Math.Floor(duration / ReanalysisBucketWidth)) * ReanalysisBucketWidth; + + // TryAdd returns true when the key was successfully added (i.e. for newly created buckets). + // Newly created buckets are initialized with the provided episode ID, so nothing else needs to be done for them. + if (histogram.TryAdd(bucket, new SeasonHistogram(id))) + { + continue; + } + + histogram[bucket].Episodes.Add(id); + } + + // Find the bucket that was seen most often, as this is likely to be the true intro length. + var maxDuration = 0; + var maxBucket = new SeasonHistogram(Guid.Empty); + foreach (var entry in histogram) + { + if (entry.Value.Count > maxBucket.Count) + { + maxDuration = entry.Key; + maxBucket = entry.Value; + } + } + + // Ensure that the most frequently seen bucket has a majority + // TODO: change to debug + percentValid = (maxBucket.Count * 100) / validCount; + _logger.LogInformation( + "Intro duration {Duration} appeared {Frequency} times ({Percent}%)", + maxDuration, + maxBucket.Count, + percentValid); + + if (percentValid < 50 || maxBucket.Episodes[0].Equals(Guid.Empty)) + { + return; + } + + _logger.LogInformation("Reanalyzing {Count} episodes", totalCount - maxBucket.Count); + + // TODO: pick two episodes at random + // Cache the fingerprint of the first episode in the max bucket to save CPU cycles + var lhs = episodes.Find(x => x.EpisodeId == maxBucket.Episodes[1]); + if (lhs is null) + { + _logger.LogError("Reanalysis failed to get episode from bucket"); + return; + } + + ReadOnlyCollection lhsFingerprint; + try + { + lhsFingerprint = FPCalc.Fingerprint(lhs); + } + catch (FingerprintException ex) + { + _logger.LogWarning("Skipping reanalysis of {Show} season {Season}: {Exception}", lhs.SeriesName, lhs.SeasonNumber, ex); + return; + } + + var lhsDuration = GetIntroDuration(lhs.EpisodeId); + var (lowTargetDuration, highTargetDuration) = ( + lhsDuration - ReanalysisTolerance, + lhsDuration + ReanalysisTolerance); + + foreach (var episode in episodes) + { + // Don't reanalyze episodes from the max bucket + if (maxBucket.Episodes.Contains(episode.EpisodeId)) + { + continue; + } + + var oldDuration = GetIntroDuration(episode.EpisodeId); + + // TODO: remove + var shortPath = episode.Path.Substring(episode.Path.Length - 40); + + // If the episode's intro duration is close enough to the targeted bucket, leave it alone. + if (Math.Abs(lhsDuration - oldDuration) <= ReanalysisTolerance) + { + _logger.LogInformation( + "Not reanalyzing episode {Path} (intro is {Initial}, target is {Max})", + shortPath, + Math.Round(oldDuration, 2), + maxDuration); + + continue; + } + + _logger.LogDebug( + "Reanalyzing episode {Path} (intro is {Initial}, target is {Max})", + shortPath, + Math.Round(oldDuration, 2), + maxDuration); + + // Analyze the episode again, ignoring whatever is returned for the known good episode. + var (_, newRhs) = FingerprintEpisodes( + lhs.EpisodeId, + lhsFingerprint, + episode.EpisodeId, + FPCalc.Fingerprint(episode)); + + // Ensure that the new intro duration is within the targeted bucket and longer than what was found previously. + var newDuration = Math.Round(newRhs.IntroEnd - newRhs.IntroStart, 2); + if (newDuration < oldDuration || newDuration < lowTargetDuration || newDuration > highTargetDuration) + { + _logger.LogInformation( + "Ignoring reanalysis for {Path} (was {Initial}, now is {New})", + shortPath, + oldDuration, + newDuration); + + continue; + } + + // TODO: change to debug + _logger.LogInformation( + "Reanalysis succeeded for {Path} (was {Initial}, now is {New})", + shortPath, + oldDuration, + newDuration); + + Plugin.Instance!.Intros[episode.EpisodeId] = newRhs; + } + } + + private double GetIntroDuration(Guid id) + { + var episode = Plugin.Instance!.Intros[id]; + return episode.Valid ? Math.Round(episode.IntroEnd - episode.IntroStart, 2) : 0; + } + /// /// Get task triggers. ///