Overall goal : To be part of the People + AI community, mainly as a developer
Goal : To learn the audio processing required to contribute to the Sunva codebase
Research material : downloaded the PDF from DataCamp's Spoken Language Processing in Python - https://projector-video-pdf-converter.datacamp.com/17718/chapter3.pdf
from pydub import AudioSegment
from pydub.playback import play
from pydub.effects import normalize
from speech_recognition import Recognizer
import numpy as np
wav_file = AudioSegment.from_file(file=r"data\BAK.wav")
print(wav_file.channels) # 2 which means stereo
print(wav_file.frame_rate) # 44100 which means 44100 samples per second
print(wav_file.sample_width) #2 which means 2 bytes = 16 bits = 65536 levels
print(wav_file.max) # 14108, the peak sample amplitude; with sample_width 2 the samples are signed 16-bit, so the possible range is -32768 to 32767 (65536 levels)
print(len(wav_file)) # 8334, the duration in milliseconds
print(np.array(wav_file.get_array_of_samples()).shape) #(735076,)
# which implies (735076/2) / 44100 = 8.33 secs is audio duration
# divide by 2 because it is stereo, so 2 samples for each sampling instance
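# pydub also exposes the duration directly, which should agree with the calculation above:
# print(wav_file.duration_seconds) # ~8.33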
quiet_wav_file = wav_file - 60
loud_wav_file = wav_file + 10
play(loud_wav_file)
# recognizer = Recognizer()
# recognizer.recognize_google(quiet_wav_file)
# normalized_wav_file = normalize(loud_wav_file)
# loud_wav_file_channels = loud_wav_file.split_to_mono()
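The commented-out recognize_google call above would not work as written, since Recognizer.recognize_google expects an AudioData object rather than a pydub AudioSegment. A minimal sketch of one way to bridge the two, assuming the speech_recognition package is installed and reusing the wav_file loaded above (the in-memory export is just one illustrative approach):

import io
import speech_recognition as sr

# export the AudioSegment to an in-memory WAV so sr.AudioFile can read it
buffer = io.BytesIO()
wav_file.export(buffer, format="wav")
buffer.seek(0)

recognizer = sr.Recognizer()
with sr.AudioFile(buffer) as source:
    audio_data = recognizer.record(source) # read the whole file into an AudioData object
# recognize_google sends the audio to Google's web API, so it needs an internet connection
# print(recognizer.recognize_google(audio_data))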
import numpy as np
a = np.array([1,2,3])
b = np.array([4,5,6])
np.hstack((a,b)) # stack the sequence of input arrays horizontally
# [1,2,3,4,5,6]
np.vstack((a,b))
# [[1 2 3], [4 5 6]]
a.reshape(-1,1)
# [[1],[2],[3]] # only 1 column, as many rows based on the dimension of the array
a.reshape(1,-1)
# [[1,2,3]]
a.reshape(-1)
# [1,2,3]
# np.iinfo shows machine limits for integer types
np.iinfo(np.int16).max # 32767
Short Time Fourier Transform - a normal Fourier transform does not contain time information, i.e. it shows which frequencies are present across the entire signal but does not tell when a particular frequency occurs. This is a challenge. To solve this, we have the short time Fourier transform, i.e. STFT, where the signal is broken down into blocks of smaller duration (say 5 seconds) and an FFT (fast Fourier transform) is computed on each block. This is then laid out on a spectrogram, with time on the x axis, frequency on the y axis and color indicating the magnitude at each frequency (refer 1)
In STFT, time and frequency resolution are inversely proportional. If we increase the window size/frame size, the frequency resolution increases because for the same frequency range (0 to fs/2) we get more, i.e. narrower, frequency bins, so we can pinpoint more accurately which frequency the sound belongs to (e.g. a 0 to 5 Hz bin and a 5 to 10 Hz bin instead of just a 0 to 10 Hz bin). But we lose out on time resolution because we have taken a larger time window, so it is harder to pinpoint when exactly that frequency occurred. Similarly, when we reduce the window size/frame size, the time resolution increases but the frequency resolution decreases. A small sketch of this trade-off follows.
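A small sketch of the trade-off, assuming librosa is installed and using a made-up 1-second test tone at a 2000 Hz sampling rate (all numbers are purely illustrative):

import numpy as np
import librosa

sr = 2000
y = np.sin(2 * np.pi * 50 * np.arange(sr) / sr) # a 50 Hz tone, 1 second long

for n_fft in (256, 1024):
    S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=n_fft // 4))
    bin_width = sr / n_fft # frequency resolution: narrower bins are better
    n_frames = S.shape[1]  # time resolution: more frames are better
    print(f"n_fft={n_fft}: bin width = {bin_width} Hz, frames = {n_frames}")
# larger n_fft -> narrower bins (better frequency resolution) but fewer, longer frames (worse time resolution)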
import matplotlib.pyplot as plt

# sampling rate
fs = 2000
# sampling interval
ts = 1.0/fs
# create array starting 0, ending just before 1, interval bw 2 values is ts
t = np.arange(0, 1, ts) #ends at 1 - 1/2000 = 0.9995
# the input of the sin function is in radians, not degrees; 1 radian = 57.29 degrees, 2*pi radians = 360 degrees
# freq and t together determine the value; since freq is fixed, different values of t give different values
# when t = 1/freq = 1, the sine finishes 1 cycle; all values of t in between are different points in that cycle
freq = 1
x = 3*np.sin(2*np.pi*freq*t)
# when t= 1/freq = 0.25, it finishes 1 cycle, all values of t in between (0.0005 to 0.2495) are different points in that cycle
freq = 4
x += np.sin(2*np.pi*freq*t)
freq = 7
x += 0.5* np.sin(2*np.pi*freq*t)
plt.figure(figsize = (8, 6))
plt.plot(t, x, 'r')
plt.ylabel('Amplitude')
plt.show()
from numpy.fft import fft, ifft, rfft
# Number of points in frequency domain is 2000, which is equal to number of points in time domain
X = fft(x)
# N = number of points in the frequency domain, which is also the number of samples in the time domain
N = len(X)
n = np.arange(N)
# fs/N = 2000/2000 = 1 Hz is the frequency bin spacing; bin k corresponds to k*(fs/N) Hz, so bin 0 is 0 Hz, bin 1 is 1 Hz, and so on
freq = n * (fs / N)
# fft gives the full spectrum (N points), whereas rfft (real FFT) gives back only the non-negative
# frequencies: for even N that is the first N/2 bins plus the Nyquist bin, so len(rfft(x)) = N/2 + 1 = 1001
Xr = rfft(x)
nr = np.arange(len(Xr))
# rfft only goes from 0 to fs/2 (fs/2 to fs is the same as -fs/2 to 0, which for a real-valued
# signal carries the same information as 0 to fs/2); the bin spacing is still fs/N
freqr = nr * (fs / N) # equivalently: np.fft.rfftfreq(N, d=ts)
plt.figure(figsize = (12, 6))
plt.subplot(121)
plt.plot(freq, np.abs(X))
plt.xlabel('Freq (Hz)')
plt.ylabel('FFT Amplitude |X(freq)|')
plt.subplot(122)
plt.plot(freqr, np.abs(Xr))
plt.xlabel('Freq (Hz)')
plt.ylabel('FFT Amplitude |X(freq)|')
plt.tight_layout()
plt.show()
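A quick sanity check on the spectrum above: for a pure sine of amplitude A, the rfft magnitude at its bin is A*N/2, so scaling by 2/N should recover the amplitudes 3, 1 and 0.5 at 1, 4 and 7 Hz (the bin spacing is 1 Hz here, so bin index equals frequency):

amplitudes = 2 * np.abs(Xr) / N
print(np.round(amplitudes[[1, 4, 7]], 3)) # expected roughly [3. 1. 0.5]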
import numpy as np
import librosa
N = 48000
x = 10*np.ones(N) # must be a 1-D array of samples; np.ones((N,1)) will not work
S = np.abs(librosa.stft(x))
print(S.shape) # (1025, 94)
# S is basically time frequency domain
# n_fft = length of windowed signal after zero padding = fft length = 2048 (default value)
# hop_length = number of samples bw two successive windows in time domain = 512 (default value is win_length // 4)
# Number of rows in STFT matrix = number of frequency bins = (n_fft/2) + 1 = 2048/2 + 1 = 1025
# Number of columns in STFT matrix = number of frames ≈ N / hop_length = 48000 / 512 = 93.75 (actual is 94; see the check below)
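# Quick check of the frame count, assuming librosa's default center=True padding: the signal is
# padded by n_fft//2 samples on each side, which gives 1 + N // hop_length frames.
print(1 + N // 512) # 94, matching S.shape[1]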
import matplotlib.pyplot as plt
import librosa.display

fig, ax = plt.subplots()
img = librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max),
                               y_axis='log', x_axis='time', ax=ax)
ax.set_title('Power spectrogram')
fig.colorbar(img, ax=ax, format="%+2.0f dB")
## refer 2 for similar examples
from pydub import AudioSegment
import numpy as np
import librosa
file = r'speech_with_silence.wav'
audio_segment = AudioSegment.from_file(file, format="wav")
audio_data = np.array(audio_segment.get_array_of_samples())
# interleaved samples -> one column per channel
audio_data = audio_data.reshape((-1, audio_segment.channels))
# average the channels to get a mono signal
audio_data = audio_data.mean(axis=1, dtype=np.int16)
# scale to the -1.0 to 1.0 float range librosa expects
audio_data = audio_data / np.iinfo(audio_data.dtype).max
S = np.abs(librosa.stft(audio_data))
PyAudio : Python library to play and record audio. It provides Python bindings for PortAudio, a cross-platform audio I/O library
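A minimal sketch of recording a few seconds from the default microphone with PyAudio and saving it as a WAV file (the parameter values and the recorded.wav filename are illustrative choices, not taken from Sunva):

import pyaudio
import wave

CHUNK = 1024             # frames read per buffer
FORMAT = pyaudio.paInt16 # 16-bit samples
CHANNELS = 1
RATE = 44100             # samples per second
SECONDS = 5

p = pyaudio.PyAudio()
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                input=True, frames_per_buffer=CHUNK)
frames = [stream.read(CHUNK) for _ in range(int(RATE / CHUNK * SECONDS))]
stream.stop_stream()
stream.close()

with wave.open("recorded.wav", "wb") as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b"".join(frames))
p.terminate()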
Code to convert speech to text using the Groq ASR/STT API
from groq import Groq
client = Groq(api_key=groq_api_key)
def transcribe_audio(audio_file_path):
    try:
        # "b" stands for binary file, "rb" means read binary
        # the file parameter of the Groq client needs the filename and the file content as input
        # since it is a binary file, the output of file.read() is a series of bytes like
        # \x95\xff\x96\x00
        with open(audio_file_path, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=(audio_file_path, file.read()),
                model="whisper-large-v3",
                response_format="text",
                language="en",
                prompt="",
            )
        return transcription
    except Exception as e:
        print(str(e))
        return None
# wav file with my voice recorded using Audacity
transcription = transcribe_audio('data/speech_with_silence.wav')
print(transcription)
import numpy as np
my_array = np.array([[1,2,3],[4,5,6]])
print(np.mean(my_array, axis=0)) #array([2.5, 3.5, 4.5]) - mean down each column
print(np.mean(my_array, axis=1)) #array([2., 5.]) - mean across each row
from pydub import AudioSegment
import numpy as np
import librosa
file = r'speech_with_silence.wav'
audio_segment = AudioSegment.from_file(file, format="wav")
audio_data = np.array(audio_segment.get_array_of_samples())
audio_data = audio_data.reshape((-1, audio_segment.channels))
audio_data = audio_data.mean(axis=1, dtype=np.int16)
audio_data = audio_data / np.iinfo(audio_data.dtype).max #normalize audio
S = np.abs(librosa.stft(audio_data))
frame_energies = np.mean(S, axis=0)
low_energy_frames = np.sum(frame_energies < 0.02) # number of frames with mean magnitude less than 0.02
# get the percentage of low-energy frames out of all the frames
# if this percentage is very high, say greater than 75%, we can treat the audio as mostly silence
proportion_low_energy = low_energy_frames / len(frame_energies)
## np.sum(np.mean(S[:, 50:100], axis=0) > 0.02) # check energies of smaller chunks of frames
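A small follow-up sketch, reusing the variables above and librosa's default hop_length of 512: mark each frame as silent with the same 0.02 heuristic and convert frame indices to timestamps, so silent stretches can be located in time.

hop_length = 512 # librosa.stft's default (n_fft // 4), as noted earlier
sr = audio_segment.frame_rate
silent_mask = frame_energies < 0.02
frame_times = librosa.frames_to_time(np.arange(len(frame_energies)), sr=sr, hop_length=hop_length)
print(f"{silent_mask.mean():.0%} of frames are below the threshold")
print("first few silent-frame timestamps (s):", np.round(frame_times[silent_mask][:5], 2))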
To run the docs, within the virtual environment use : mkdocs serve