Files
Prageeth Monasha 48a28457e4 [main] voice added;
2026-02-22 14:21:56 +01:00

545 lines
20 KiB
C#

// To enable SoundTouch library tempo change for audio frame shrinking when catching up:
// Define PHOTON_VOICE_SOUND_TOUCH_ENABLE
// Set PlayDelayConfig.TempoChangeHQ to true
// Add SoundTouch library https://gitlab.com/soundtouch/soundtouch
// Android: edit /Android-lib/jni/Android.mk:
// add ../../SoundTouchDLL/SoundTouchDLL.cpp to sources list
// add LOCAL_CFLAGS += -DDLL_EXPORTS
// Windows: http://soundtouch.surina.net/download.html
// Add SoundTouch library C# wrapper https://gitlab.com/soundtouch/soundtouch/-/blob/master/source/csharp-example/SoundTouch.cs
// Replace "SoundTouch.dll" with "soundtouch" in SoundTouch.cs
using System;
using System.Collections.Concurrent;
namespace Photon.Voice
{
public interface IAudioOut<T>
{
bool IsPlaying { get; }
void Start(int frequency, int channels, int frameSamplesPerChannel);
void Flush();
void Stop();
void Push(T[] frame);
void Service();
int Lag { get; } // ms
}
public class AudioOutDummy<T> : IAudioOut<T>
{
public bool IsPlaying => false;
public int Lag => 0;
public void Flush()
{
}
public void Push(T[] frame)
{
}
public void Service()
{
}
public void Start(int frequency, int channels, int frameSamplesPerChannel)
{
}
public void Stop()
{
}
}
public class AudioOutDelayControl
{
[Serializable]
public struct PlayDelayConfig
{
static public PlayDelayConfig Default = new PlayDelayConfig()
{
Low = 200, // should work for fairly high rtt variance
High = 200, // rely on automatic tolerance value
Max = 1000, // drop frames if the delay is 1 sec. or higher
SpeedUpPerc = 5,
#if PHOTON_VOICE_SOUND_TOUCH_ENABLE
TempoChangeHQ = false,
#endif
};
public int Low; // ms: Audio player initilizes the delay with this value on Start and after flush and targets it during corrections
public int High; // ms: Audio player tries to keep the delay below this value
public int Max; // ms: Audio player guarantees that the delay never exceeds this value by dropping frames
public int Delay // ms: simplified delay API
{
get => Low;
set
{
Low = value;
High = value; // rely on automatic tolerance value
Max = Default.Max;
}
}
public int SpeedUpPerc; // playback speed-up to catch up the stream
}
#if PHOTON_VOICE_SOUND_TOUCH_ENABLE
public bool TempoChangeHQ;
#endif
}
// Consumes audio frames via Push(), optionally resizes and writes (OutWrite) them to the output to keep constant delay
// between output playback position (OutPos) and input stream position (advanced with each write).
// Assumes output is always playing.
public abstract class AudioOutDelayControl<T> : AudioOutDelayControl, IAudioOut<T>
{
readonly protected int sizeofT = System.Runtime.InteropServices.Marshal.SizeOf(default(T));
// methods to implement
// returns playback position in samples, either absolute or in a ring buffer
abstract public long OutPos { get; }
abstract public void OutCreate(int frequency, int channels, int bufferSamples);
abstract public void OutStart();
// offsetSamples is an offset in the ring buffer
abstract public void OutWrite(T[] data, int offsetSamples);
const int TEMPO_UP_SKIP_GROUP = 6;
private int frameSamples;
private int frameSize;
private int bufferSamples;
private int bufferSamplesHalf;
private int frequency;
// stream positions in samples modulo bufferSamples
private int writeSamplePos; // updated and read only in processFrame() (called from Service() or Push())
private int clearSamplePos; // updated only in Service()
private PlayDelayConfig playDelayConfig;
private int channels;
private bool started;
private bool flushed = true;
private int targetDelaySamples;
private int upperTargetDelaySamples; // correct if the delay is higher: gradually move to the target using input frames resampling
private int maxDelaySamples; // skip input samples if the delay is higher to always keep it below
private const int NO_PUSH_TIMEOUT_MS = 120; // should be greater than Push() call interval, 60 ms max packet length + some jitter
int lastPushTime = Environment.TickCount - NO_PUSH_TIMEOUT_MS;
protected readonly ILogger logger;
protected readonly string logPrefix;
private readonly bool debugInfo;
readonly bool processInService = false; // enqueue frame in Push() to process it in Service(), otherwise process directly in Push()
private T[] zeroFrame;
// Used in a hack allowing to distinguish between regular and zeroing OutWrite() calls in inherited classes
public bool IsZeroFrame(T[] f)
{
return zeroFrame == f;
}
private T[] resampledFrame;
#if PHOTON_VOICE_SOUND_TOUCH_ENABLE
soundtouch.SoundTouch st;
#endif
AudioUtil.TempoUp<T> tempoUp;
bool tempoChangeHQ; // true if library is available
public AudioOutDelayControl(bool processInService, PlayDelayConfig playDelayConfig, ILogger logger, string logPrefix, bool debugInfo)
{
this.processInService = processInService;
this.playDelayConfig = playDelayConfig;
this.logger = logger;
this.logPrefix = logPrefix;
this.debugInfo = debugInfo;
}
public int Lag
{
get
{
if (this.started)
{
int d = this.writeSamplePos - (int)(OutPos % this.bufferSamples);
// detect ring buffer wrapping and convert to ms
return (d > bufferSamplesHalf ? d - bufferSamples : d < -bufferSamplesHalf ? d + bufferSamples : d) *1000 / frequency;
}
else
{
return 0;
}
}
}
public bool IsFlushed
{
get { return !started || this.flushed; }
}
public bool IsPlaying
{
get { return !IsFlushed && (Environment.TickCount - lastPushTime < NO_PUSH_TIMEOUT_MS); }
}
public void Start(int frequency, int channels, int frameSamples)
{
//frequency = (int)(frequency * 1.2); // underrun test
//frequency = (int)(frequency / 1.2); // overrun test
this.frequency = frequency;
this.channels = channels;
// adjust play delay tolerances so that they do not switch the state until playSamplePos is updated in Service()
// assuming Service() is called at least 30 times per sec. plus some threshold
int playSamplePosUpdateInterval = frequency / 20;
this.targetDelaySamples = playDelayConfig.Low * frequency / 1000;
// add a tolerance to make sure that we have something to play when the delay set to 0
if (this.targetDelaySamples < playSamplePosUpdateInterval)
{
this.targetDelaySamples = playSamplePosUpdateInterval;
}
// make sure that the tolerance is not less than the user set
this.upperTargetDelaySamples = this.targetDelaySamples + (playDelayConfig.High - playDelayConfig.Low) * frequency / 1000;
// add some tolerance in case it set to 0 by user
if (this.upperTargetDelaySamples < targetDelaySamples + playSamplePosUpdateInterval)
{
this.upperTargetDelaySamples = targetDelaySamples + playSamplePosUpdateInterval;
}
// make sure that the tolerance is not less than the user set
// if playDelayConfig.High <= playDelayConfig.Max, the catching-up is disabled
this.maxDelaySamples = this.upperTargetDelaySamples + (playDelayConfig.Max - playDelayConfig.High) * frequency / 1000;
this.bufferSamples = 4 * this.maxDelaySamples; // make sure we have enough space to detect wrapped positions order in the buffer
if (this.bufferSamples < 2 * frequency)
{
this.bufferSamples = 2 * frequency; // at least 2 sec.
}
this.bufferSamplesHalf = bufferSamples / 2;
this.frameSamples = frameSamples;
this.frameSize = frameSamples * channels;
this.writeSamplePos = this.targetDelaySamples;
if (this.framePool == null || this.framePool.Info != this.frameSize)
{
this.framePool = new ArrayPool<T>(FRAME_POOL_CAPACITY, "AudioOutDelayControl", this.frameSize);
}
this.zeroFrame = new T[this.frameSize];
this.resampledFrame = new T[this.frameSize];
#if PHOTON_VOICE_SOUND_TOUCH_ENABLE
if (this.playDelayConfig.TempoChangeHQ)
{
try
{
st = new soundtouch.SoundTouch();
st.Channels = (uint)channels;
st.SampleRate = (uint)frequency;
tempoChangeHQ = true;
}
catch (DllNotFoundException e)
{
logger.Log(LogLevel.Error, "{0} SoundTouch library not found, disabling HQ tempo mode: {1}", this.logPrefix, e);
tempoChangeHQ = false;
}
}
#else
tempoChangeHQ = false;
#endif
if (!tempoChangeHQ)
{
tempoUp = new AudioUtil.TempoUp<T>();
}
OutCreate(frequency, channels, bufferSamples);
OutStart();
this.started = true;
this.logger.Log(LogLevel.Info, "{0} Start: {1} bs={2} ch={3} f={4} tds={5} utds={6} mds={7} speed={8} tempo={9}", this.logPrefix, sizeofT == 2 ? "short" : "float", bufferSamples, channels, frequency, targetDelaySamples, upperTargetDelaySamples, maxDelaySamples, playDelayConfig.SpeedUpPerc, tempoChangeHQ ? "HQ" : "LQ");
}
ConcurrentQueue<T[]> frameQueue = new ConcurrentQueue<T[]>();
public const int FRAME_POOL_CAPACITY = 50;
ArrayPool<T> framePool;
bool catchingUp = false;
void processFrame(T[] frame)
{
int playSamplePos = (int)(OutPos % this.bufferSamples);
int d = this.writeSamplePos - playSamplePos;
// detect ring buffer wrapping
int lagSamples = d > bufferSamplesHalf ? d - bufferSamples : d < -bufferSamplesHalf ? d + bufferSamples : d;
if (!this.flushed)
{
if (lagSamples > maxDelaySamples)
{
if (this.debugInfo)
{
logger.Log(LogLevel.Debug, "{0} overrun {1} {2} {3} {4} {5}", this.logPrefix, upperTargetDelaySamples, lagSamples, playSamplePos, this.writeSamplePos, playSamplePos + targetDelaySamples);
}
this.writeSamplePos = (playSamplePos + maxDelaySamples) % this.bufferSamples;
lagSamples = maxDelaySamples;
}
else if (lagSamples < 0)
{
if (this.debugInfo)
{
logger.Log(LogLevel.Debug, "{0} underrun {1} {2} {3} {4} {5}", this.logPrefix, upperTargetDelaySamples, lagSamples, playSamplePos, this.writeSamplePos, playSamplePos + targetDelaySamples);
}
this.writeSamplePos = (playSamplePos + targetDelaySamples) % this.bufferSamples;
lagSamples = targetDelaySamples;
}
}
if (frame == null) // flush signalled
{
this.flushed = true;
if (this.debugInfo)
{
logger.Log(LogLevel.Debug, "{0} stream flush pause {1} {2} {3} {4} {5}", this.logPrefix, upperTargetDelaySamples, lagSamples, playSamplePos, this.writeSamplePos, playSamplePos + targetDelaySamples);
}
if (catchingUp)
{
#if PHOTON_VOICE_SOUND_TOUCH_ENABLE
if (tempoChangeHQ)
{
st.Flush();
writeTempoHQ();
}
#endif
catchingUp = false;
if (this.debugInfo)
{
logger.Log(LogLevel.Debug, "{0} stream sync reset {1} {2} {3} {4} {5}", this.logPrefix, upperTargetDelaySamples, lagSamples, playSamplePos, this.writeSamplePos, playSamplePos + targetDelaySamples);
}
}
return;
}
else
{
if (this.flushed)
{
this.writeSamplePos = (playSamplePos + targetDelaySamples) % this.bufferSamples;
lagSamples = targetDelaySamples;
this.flushed = false;
if (this.debugInfo)
{
logger.Log(LogLevel.Debug, "{0} stream unpause {1} {2} {3} {4} {5}", this.logPrefix, upperTargetDelaySamples, lagSamples, playSamplePos, this.writeSamplePos, playSamplePos + targetDelaySamples);
}
}
}
// starting catching up
if (lagSamples > upperTargetDelaySamples && !catchingUp)
{
if (!tempoChangeHQ)
{
tempoUp.Begin(channels, playDelayConfig.SpeedUpPerc, TEMPO_UP_SKIP_GROUP);
}
#if PHOTON_VOICE_SOUND_TOUCH_ENABLE
else
{
st.Clear();
var tempo = (float)(100 + playDelayConfig.SpeedUpPerc) / 100;
st.Tempo = tempo;
}
#endif
catchingUp = true;
if (this.debugInfo)
{
logger.Log(LogLevel.Debug, "{0} stream sync started {1} {2} {3} {4} {5}", this.logPrefix, upperTargetDelaySamples, lagSamples, playSamplePos, this.writeSamplePos, playSamplePos + targetDelaySamples);
}
}
// finishing catching up
bool frameIsWritten = false; // first frame after switching from catching up requires special processing to flush TempoUp (the end of skipping wave removed if required)
if (lagSamples <= targetDelaySamples && catchingUp)
{
if (!tempoChangeHQ)
{
int skipSamples = tempoUp.End(frame);
int resampledLenSamples = frame.Length / channels - skipSamples;
Buffer.BlockCopy(frame, skipSamples * channels * sizeofT, resampledFrame, 0, resampledLenSamples * channels * sizeofT);
writeResampled(resampledFrame, resampledLenSamples);
frameIsWritten = true;
}
#if PHOTON_VOICE_SOUND_TOUCH_ENABLE
else
{
st.Flush();
writeTempoHQ();
st.Clear();
}
#endif
catchingUp = false;
if (this.debugInfo)
{
logger.Log(LogLevel.Debug, "{0} stream sync finished {1} {2} {3} {4} {5}", this.logPrefix, upperTargetDelaySamples, lagSamples, playSamplePos, this.writeSamplePos, playSamplePos + targetDelaySamples);
}
}
if (frameIsWritten)
{
return;
}
if (catchingUp)
{
if (!tempoChangeHQ)
{
int resampledLenSamples = tempoUp.Process(frame, resampledFrame);
writeResampled(resampledFrame, resampledLenSamples);
}
#if PHOTON_VOICE_SOUND_TOUCH_ENABLE
else
{
if (sizeofT == 2)
{
st.PutSamplesI16(frame as short[], (uint)(frame.Length / channels));
}
else
{
st.PutSamples(frame as float[], (uint)(frame.Length / channels));
}
lagSamples -= writeTempoHQ();
}
#endif
}
else
{
OutWrite(frame, this.writeSamplePos);
this.writeSamplePos = (this.writeSamplePos + frame.Length / this.channels) % this.bufferSamples;
}
}
// should be called in Update thread
public void Service()
{
if (this.started)
{
if (processInService)
{
while (frameQueue.TryDequeue(out T[] frame))
{
processFrame(frame);
if (frame == null)
{
break; // flush signalled
}
framePool.Free(frame, frame.Length);
}
}
int playSamplePos = (int)(OutPos % this.bufferSamples);
if (this.clearSamplePos > playSamplePos)
{
this.clearSamplePos -= this.bufferSamples;
}
// clear played back buffer segment
for (; this.clearSamplePos + this.frameSamples < playSamplePos; this.clearSamplePos += this.frameSamples)
{
int o = (int)(this.clearSamplePos % this.bufferSamples);
if (o < 0) o += this.bufferSamples;
OutWrite(this.zeroFrame, o);
}
}
}
#if PHOTON_VOICE_SOUND_TOUCH_ENABLE
int writeTempoHQ()
{
int resampledLenSamples;
if (sizeofT == 2)
{
resampledLenSamples = (int)st.ReceiveSamplesI16(resampledFrame as short[], (uint)(resampledFrame.Length / channels));
}
else
{
resampledLenSamples = (int)st.ReceiveSamples(resampledFrame as float[], (uint)(resampledFrame.Length / channels));
}
return writeResampled(resampledFrame, resampledLenSamples);
}
#endif
int writeResampled(T[] f, int resampledLenSamples)
{
// zero not used part of the buffer because OutWrite() writes entire frame
// if this frame is the last, grabage may be played back
int tailSize = (f.Length - resampledLenSamples * channels) * sizeofT;
if (tailSize > 0) // it may be 0 what BlockCopy does not like
{
Buffer.BlockCopy(this.zeroFrame, 0, f, resampledLenSamples * channels * sizeofT, tailSize);
}
OutWrite(f, this.writeSamplePos);
this.writeSamplePos = (this.writeSamplePos + resampledLenSamples) % this.bufferSamples;
return resampledLenSamples;
}
// may be called on any thread
public void Push(T[] frame)
{
if (!this.started)
{
return;
}
if (frame.Length == 0)
{
return;
}
if (frame.Length != this.frameSize)
{
logger.Log(LogLevel.Error, "{0} audio frames are not of size: {1} != {2}", this.logPrefix, frame.Length, this.frameSize);
return;
}
if (processInService)
{
T[] b = framePool.New();
Buffer.BlockCopy(frame, 0, b, 0, frame.Length * sizeofT);
this.frameQueue.Enqueue(b);
}
else
{
processFrame(frame);
}
lastPushTime = Environment.TickCount;
}
public void Flush()
{
if (processInService)
{
this.frameQueue.Enqueue(null);
}
else
{
processFrame(null);
}
}
virtual public void Stop()
{
#if PHOTON_VOICE_SOUND_TOUCH_ENABLE
if (st != null)
{
st.Dispose();
st = null;
}
#endif
this.started = false;
}
}
}