Extracting a text from very large wav file using speech recognition

Extracting a text from very large wav file using speech recognition - python-3.x

I am having very large wav file ( approx 1 hour audio approx 700 mb) and I want to extract text from this file using python
my code is
from datetime import datetime
start_time = datetime.now()
print("Start Time : " + str(start_time))
import speech_recognition as sr
r = sr.Recognizer()
with sr.AudioFile("audio_chunk_1.wav") as source:
audio = r.record(source)
try:
s = r.recognize_google(audio)
print("Text: "+s)
except Exception as e:
print("Exception: "+str(e))
end_time = datetime.now()
print("End Time : " + str(end_time))
print('Duration: {}'.format(end_time - start_time))
it was giving me an error on my laptop so decided to split files
from pydub import AudioSegment
audio_file= "demo.wav"
audio = AudioSegment.from_wav(audio_file)
audio_chunk=audio[0:300000]
audio_chunk.export( "audio_chunk_{}.wav".format(1), format="wav")
audio_chunk=audio[300001:400000]
audio_chunk.export( "audio_chunk_{}.wav".format(2), format="wav")
audio_chunk=audio[400001:500000]
audio_chunk.export( "audio_chunk_{}.wav".format(3), format="wav")
I have to split wav files randomly to extract text. Can you pl suggest a better way to do it ?
Thanks

Related

cv2 wait key (Hoping to make it variable depending on latency)

When using my code bellow (It turns YouTube videos into ASCII with audio) The latency between the audio and video grows bigger each frame (I have tried many different wait times) I was wondering if there is a way to change the code to make it so the wait key changes depending on how much latency there is. I have only been coding for 6 months so sorry if there is any bad code.
import pytube
import os
import cv2
import PIL.Image
import winsound
from moviepy.editor import *
from pydub import AudioSegment
import threading
import time
##################################################
# downloads the youtube video and lets you input the path for where it should be saved
url = input ("Enter the you youtube url: \n\n")
path = input ("Enter the path where you want the youtube video to be saved: \n\n")
try:
youtube = pytube.YouTube(url)
streams = youtube.streams.all()
video = youtube.streams.get_highest_resolution()
video.download(path)
print ("Done!")
except:
print ("\nYoutube video has coppy righted material so it can not be downloaded. Try again with a different video")
##################################################
#locates all the files with the file extension .mp4
for root, dirs, files in os.walk(path):
for file in files:
if file.endswith(".mp4"):
file_name = (file)
file_path = (os.path.join(root,file))
print (file_name)
print (file_path)
##################################################
mp3_file = (path+"\\")
mp3_file = (mp3_file+"audio.mp3")
mp4_file = (path+"\\")
mp4_file = (mp4_file+file_name)
VideoClip = VideoFileClip(mp4_file)
audioclip = VideoClip.audio
audioclip.write_audiofile(mp3_file)
audioclip.close()
VideoClip.close()
sound = AudioSegment.from_mp3(mp3_file)
sound.export(path+"/audio.wav", format = "wav")
##################################################
def a():
# Ascii characters used to create the output
ASCII_CHARS = ["#", "#", "S", "%", "?", "*", "+", ";", ":", ",", "."]
def resized_gray_image(image ,new_width=80):
width,height = image.size
aspect_ratio = height/width
new_height = int(aspect_ratio * new_width)
resized_gray_image = image.resize((new_width,new_height)).convert('L')
return resized_gray_image
def pix2chars(image):
pixels = image.getdata()
characters = "".join([ASCII_CHARS[pixel//25] for pixel in pixels])
return characters
def generate_frame(image,new_width=80):
new_image_data = pix2chars(resized_gray_image(image))
total_pixels = len(new_image_data)
ascii_image = "\n".join([new_image_data[index:(index+new_width)] for index in range(0, total_pixels, new_width)])
sys.stdout.write(ascii_image)
os.system('cls' if os.name == 'nt' else 'clear')
cap = cv2.VideoCapture(mp4_file)
print (cap)
try:
while True:
ret,frame = cap.read()
cv2.imshow("frame",frame)
generate_frame(PIL.Image.fromarray(frame))
cv2.waitKey(1)
except:
threading.Thread(target=c).start()
##################################################
def b():
winsound.PlaySound(path+"/audio.wav",winsound.SND_FILENAME)
##################################################
def c ():
os.remove (mp3_file)
os.remove (mp4_file)
os.remove (path+"/audio.wav")
threading.Thread(target=a).start()
threading.Thread(target=b).start()

why speech_recognition is no longer working?

I used the following code a couple of days ago and it was working fine, but now it does not recognize any of the audio files it used to recognize before. I am wondering what is wrong?
import speech_recognition as sr
r = sr.Recognizer()
audio_file_name = 'audio.wav'
audiofile = sr.AudioFile(audio_file_name)
with audiofile as source:
audio = r.record(source)
try:
text = r.recognize_google(audio)
print("i: {} You said : {}".format(i+1, text))
except:
print("Sorry could not recognize what you said")

Audio to text is slow and words are getting dropped

I have a code which takes videos from an input folder, converts it into audio file(.wav) using ffmpeg.
It then converts the audio file to text by recording 30 seconds audio (dura=30) and converting it to text using google translate api.
The problem is that the code takes a lot of time to convert video to text and it drops first two words and some words after every 30 seconds.
import speech_recognition as sr
import sys
import shutil
from googletrans import Translator
from pathlib import Path
import os
import wave
def audio_to_text(self,video_lst,deploy_path,video_path,audio_path):
try:
txt_lst=[]
for video_file in video_lst:
file_part=video_file.split('.')
audio_path_mod = audio_path +'/'+ '.'.join(file_part[:-1])
dir_path=video_path+'.'.join(file_part[:-1])
self.createDirectory(audio_path_mod)
audio_file='.'.join(file_part[:-1])+'.wav'
command_ffmpeg='set PATH=%PATH%;'+deploy_path.replace('config','script')+'audio_video/ffmpeg/bin/'
command='ffmpeg -i '+video_path+'/'+video_file+' '+audio_path_mod+'/'+audio_file
os.system(command_ffmpeg)
os.system(command)
r=sr.Recognizer()
dura=30
lang='en'
wav_filename=audio_path_mod+'/'+audio_file
f = wave.open(wav_filename, 'r')
frames = f.getnframes()
rate = f.getframerate()
audio_duration = frames / float(rate)
final_text_lst=[]
counter=0
with sr.AudioFile(wav_filename) as source:
while counter<audio_duration:
audio=r.record(source,duration=dura)
counter+=dura
try:
str=r.recognize_google(audio)
final_text_lst.append(str)
except Exception as e:
print(e)
print('Text data generated..')
text_path=audio_path_mod+'/'+audio_file.replace('.wav','_audio_text.csv')
with open(text_path, 'w') as f:
f.write(' '.join(final_text_lst))
except Exception as e:
print(e)
Any help/suggestion would be valuable. Thanks in advance.

Python 3.7 text-to-speech only playing one audio file, sometimes no audio at all, using IDLE editor

I am using the IDLE editor and Python 3.7, and I would like to know why my code is not playing multiple audio files (sequentially) and sometimes not playing audio at all:
import re
import wave
import pyaudio
import _thread
import time
class TextToSpeech:
CHUNK = 1024
def __init__(self, words_pron_dict:str = 'cmudict-0.7b.txt'):
self._l = {}
self._load_words(words_pron_dict)
def _load_words(self, words_pron_dict:str):
with open(words_pron_dict, 'r') as file:
for line in file:
if not line.startswith(';;;'):
key, val = line.split(' ',2)
self._l[key] = re.findall(r"[A-Z]+",val)
def get_pronunciation(self, str_input):
list_pron = []
for word in re.findall(r"[\w']+",str_input.upper()):
if word in self._l:
list_pron += self._l[word]
print(list_pron)
delay=0
for pron in list_pron:
_thread.start_new_thread( TextToSpeech._play_audio, (pron,delay,))
delay += 0.145
def _play_audio(sound, delay):
try:
time.sleep(delay)
wf = wave.open("sounds/"+sound+".wav", 'rb')
p = pyaudio.PyAudio()
stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
channels=wf.getnchannels(),
rate=wf.getframerate(),
output=True)
data = wf.readframes(TextToSpeech.CHUNK)
while data:
stream.write(data)
data = wf.readframes(TextToSpeech.CHUNK)
stream.stop_stream()
stream.close()
p.terminate()
return
except:
pass
if __name__ == '__main__':
tts = TextToSpeech()
while True:
tts.get_pronunciation(input('Enter a word or phrase: '))
I have a list of audio files that will play in a certain order, depending on what word I type in, when running the code. The code has no errors, but when I run it, when I type in a word, it only plays the first audio file needed (Example: When I type in "buy" it requires these two sounds: "b" and "ie" played together), but it only plays the first sound, "b", and sometimes no sound at all.
Why isn't it playing multiple audio files? I know that lots of people have been having this issue, but haven't been able to solve it.
Thank you for your help in advance, it is greatly appreciated :)

How to create a compressed archive of entire storage unit in Python?

I want to create a compressed backup of my eMMC (which has multiple primary partitions) using a Python3 script. I want to mimic the behaviour of piping the output of dd to bzip2 or xz. I have tried the two following approaches:
Incremental Compressor
import bz2
import time
import binascii
compressor = bz2.BZ2Compressor()
with open('/dev/mmcblk0', 'rb') as src_handle:
with open('/media/usb0/backup_emmc.bz2', 'wb') as dest_handle:
last_check = time.time()
read = 0
written = 0
chunksize_bytes = 10485760 # 10 MB
while True:
input = src_handle.read(chunksize_bytes)
if not input:
print("Flushing remaining data")
remaining_data = compressor.flush()
dest_handle.write(binascii.hexlify(remaining_data))
print("Copying complete")
break
else:
written += dest_handle.write(binascii.hexlify(compressor.compress(input)))
if time.time() - last_check >= 1:
last_check = time.time()
print("Read {:.2f} MB".format(read / (1024.0**2)))
print("Written {:.2f} MB".format(written / (1024.0**2)))
read += chunksize_bytes
Writing to Compressed File
import bz2
import shutil
with open('/dev/mmcblk0', 'rb') as src_handle:
with bz2.BZ2File('/media/usb0/backup_emmc.bz2', 'wb', compresslevel=9) as dest_handle:
shutil.copyfileobj(src_handle, dest_handle)
Both of these methods produce a tiny file, which cannot possibly be a compressed archive made of my entire eMMC. When I copy back using bzip2 and dd, the content of the storage is no longer readable. Other than using bash utilities through subprocess, what would be the correct pythonic way of doing it?

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Extracting a text from very large wav file using speech recognition - python-3.x

Related

cv2 wait key (Hoping to make it variable depending on latency)

why speech_recognition is no longer working?

Audio to text is slow and words are getting dropped

Python 3.7 text-to-speech only playing one audio file, sometimes no audio at all, using IDLE editor

How to create a compressed archive of entire storage unit in Python?

Categories

Resources