Audio to text is slow and words are getting dropped - python-3.x

I have a code which takes videos from an input folder, converts it into audio file(.wav) using ffmpeg.
It then converts the audio file to text by recording 30 seconds audio (dura=30) and converting it to text using google translate api.
The problem is that the code takes a lot of time to convert video to text and it drops first two words and some words after every 30 seconds.
import speech_recognition as sr
import sys
import shutil
from googletrans import Translator
from pathlib import Path
import os
import wave
def audio_to_text(self,video_lst,deploy_path,video_path,audio_path):
try:
txt_lst=[]
for video_file in video_lst:
file_part=video_file.split('.')
audio_path_mod = audio_path +'/'+ '.'.join(file_part[:-1])
dir_path=video_path+'.'.join(file_part[:-1])
self.createDirectory(audio_path_mod)
audio_file='.'.join(file_part[:-1])+'.wav'
command_ffmpeg='set PATH=%PATH%;'+deploy_path.replace('config','script')+'audio_video/ffmpeg/bin/'
command='ffmpeg -i '+video_path+'/'+video_file+' '+audio_path_mod+'/'+audio_file
os.system(command_ffmpeg)
os.system(command)
r=sr.Recognizer()
dura=30
lang='en'
wav_filename=audio_path_mod+'/'+audio_file
f = wave.open(wav_filename, 'r')
frames = f.getnframes()
rate = f.getframerate()
audio_duration = frames / float(rate)
final_text_lst=[]
counter=0
with sr.AudioFile(wav_filename) as source:
while counter<audio_duration:
audio=r.record(source,duration=dura)
counter+=dura
try:
str=r.recognize_google(audio)
final_text_lst.append(str)
except Exception as e:
print(e)
print('Text data generated..')
text_path=audio_path_mod+'/'+audio_file.replace('.wav','_audio_text.csv')
with open(text_path, 'w') as f:
f.write(' '.join(final_text_lst))
except Exception as e:
print(e)
Any help/suggestion would be valuable. Thanks in advance.

Related

how to use Google Cloud Translate API for translating bulk data?

I have a csv file of several thousands of rows in multiple languages and I am thinking of using google cloud translate API to translate foreign language text into English. I have used a simple code to find out if everything works properly and the code is running smoothly.
from google.cloud import translate_v2 as translate
from time import sleep
from tqdm.notebook import tqdm
import multiprocessing as mp
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "file path.py"
translate_client = translate.Client()
text = "Good Morning, My Name is X."
target ="ja"
output = translate_client.translate(text, target_language=target)
print(output)
I want to now import csv file (using pandas) and translate the text and save the output as a csv file. But don't know how should I do that. Most of the examples I found stop at translating sample text just like above.
Can anyone suggest how can I do this?
To translate the text in csv file and save the output in same CSV file using Google Cloud Translation API, you can use below code:
import csv
from pathlib import Path
def translate_text(target, text):
"""Translates text into the target language.
Target must be an ISO 639-1 language code.
See https://g.co/cloud/translate/v2/translate-reference#supported_languages
"""
import six
from google.cloud import translate_v2 as translate
translate_client = translate.Client()
if isinstance(text, six.binary_type):
text = text.decode("utf-8")
# Text can also be a sequence of strings, in which case this method
# will return a sequence of results for each text.
result = translate_client.translate(text, target_language=target)
# print(u"Text: {}".format(result["input"]))
# print(u"Translation: {}".format(result["translatedText"]))
# print(u"Detected source language: {}".format(result["detectedSourceLanguage"]))
return result["translatedText"]
def main(input_file, translate_to):
"""
Translate a text file and save as a CSV file
using Google Cloud Translation API
"""
input_file_path = Path(input_file)
target_lang = translate_to
output_file_path = input_file_path.with_suffix('.csv')
with open(input_file_path) as f:
list_lines = f.readlines()
total_lines = len(list_lines)
with open(output_file_path, 'w') as csvfile:
my_writer = csv.writer(csvfile, delimiter=',', quotechar='"')
my_writer.writerow(['id', 'original_text', 'translated_text'])
for i, each_line in enumerate(list_lines):
line_id = f'{i + 1:04}'
original_text = each_line.strip('\n') # Strip for the writer(*).
translated_text = translate_text(
target=target_lang,
text=each_line)
my_writer.writerow([line_id, original_text, translated_text]) # (*)
# Progress monitor, non-essential.
print(f"""
{line_id}/{total_lines:04}
{original_text}
{translated_text}""")
if __name__ == '__main__':
origin_file = input('Input text file? >> ')
output_lang = input('Output language? >> ')
main(input_file=origin_file,
translate_to=output_lang)
Example:
Translated text in input file to target language “es”, the output got stored in the same csv file.
Input:
new.csv
How are you doing,Is everything fine there
Do it today
Output:
new.csv
id,original_text,translated_text
0001,"How are you doing,Is everything fine there",¿Cómo estás? ¿Está todo bien allí?
0002,Do it today,Hazlo hoy

Extracting a text from very large wav file using speech recognition

I am having very large wav file ( approx 1 hour audio approx 700 mb) and I want to extract text from this file using python
my code is
from datetime import datetime
start_time = datetime.now()
print("Start Time : " + str(start_time))
import speech_recognition as sr
r = sr.Recognizer()
with sr.AudioFile("audio_chunk_1.wav") as source:
audio = r.record(source)
try:
s = r.recognize_google(audio)
print("Text: "+s)
except Exception as e:
print("Exception: "+str(e))
end_time = datetime.now()
print("End Time : " + str(end_time))
print('Duration: {}'.format(end_time - start_time))
it was giving me an error on my laptop so decided to split files
from pydub import AudioSegment
audio_file= "demo.wav"
audio = AudioSegment.from_wav(audio_file)
audio_chunk=audio[0:300000]
audio_chunk.export( "audio_chunk_{}.wav".format(1), format="wav")
audio_chunk=audio[300001:400000]
audio_chunk.export( "audio_chunk_{}.wav".format(2), format="wav")
audio_chunk=audio[400001:500000]
audio_chunk.export( "audio_chunk_{}.wav".format(3), format="wav")
I have to split wav files randomly to extract text. Can you pl suggest a better way to do it ?
Thanks

Python 3.7 text-to-speech only playing one audio file, sometimes no audio at all, using IDLE editor

I am using the IDLE editor and Python 3.7, and I would like to know why my code is not playing multiple audio files (sequentially) and sometimes not playing audio at all:
import re
import wave
import pyaudio
import _thread
import time
class TextToSpeech:
CHUNK = 1024
def __init__(self, words_pron_dict:str = 'cmudict-0.7b.txt'):
self._l = {}
self._load_words(words_pron_dict)
def _load_words(self, words_pron_dict:str):
with open(words_pron_dict, 'r') as file:
for line in file:
if not line.startswith(';;;'):
key, val = line.split(' ',2)
self._l[key] = re.findall(r"[A-Z]+",val)
def get_pronunciation(self, str_input):
list_pron = []
for word in re.findall(r"[\w']+",str_input.upper()):
if word in self._l:
list_pron += self._l[word]
print(list_pron)
delay=0
for pron in list_pron:
_thread.start_new_thread( TextToSpeech._play_audio, (pron,delay,))
delay += 0.145
def _play_audio(sound, delay):
try:
time.sleep(delay)
wf = wave.open("sounds/"+sound+".wav", 'rb')
p = pyaudio.PyAudio()
stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
channels=wf.getnchannels(),
rate=wf.getframerate(),
output=True)
data = wf.readframes(TextToSpeech.CHUNK)
while data:
stream.write(data)
data = wf.readframes(TextToSpeech.CHUNK)
stream.stop_stream()
stream.close()
p.terminate()
return
except:
pass
if __name__ == '__main__':
tts = TextToSpeech()
while True:
tts.get_pronunciation(input('Enter a word or phrase: '))
I have a list of audio files that will play in a certain order, depending on what word I type in, when running the code. The code has no errors, but when I run it, when I type in a word, it only plays the first audio file needed (Example: When I type in "buy" it requires these two sounds: "b" and "ie" played together), but it only plays the first sound, "b", and sometimes no sound at all.
Why isn't it playing multiple audio files? I know that lots of people have been having this issue, but haven't been able to solve it.
Thank you for your help in advance, it is greatly appreciated :)

Python add audio to video opencv

I use python cv2 module to join jpg frames into video, but I can't add audio to it. Is it possible to add audio to video in python without ffmpeg?
P.S. Sorry for my poor English
Use ffpyplayer to handle the audio part.
import cv2
import numpy as np
#ffpyplayer for playing audio
from ffpyplayer.player import MediaPlayer
video_path="../L1/images/Godwin.mp4"
def PlayVideo(video_path):
video=cv2.VideoCapture(video_path)
player = MediaPlayer(video_path)
while True:
grabbed, frame=video.read()
audio_frame, val = player.get_frame()
if not grabbed:
print("End of video")
break
if cv2.waitKey(28) & 0xFF == ord("q"):
break
cv2.imshow("Video", frame)
if val != 'eof' and audio_frame is not None:
#audio
img, t = audio_frame
video.release()
cv2.destroyAllWindows()
PlayVideo(video_path)
The sample code will work but you need to play around the cv2.waitKey(28) depending on the speed of your video.
This is how I am reading audio and video frames:
from moviepy.editor import *
from pafy import pafy
if __name__ == '__main__':
video = pafy.new('https://www.youtube.com/watch?v=K_IR90FthXQ')
stream = video.getbest(preftype='mp4')
video = VideoFileClip(stream.url)
audio = video.audio
for t, video_frame in video.iter_frames(with_times=True):
audio_frame = audio.get_frame(t)
print(audio_frame)
print(video_frame)
This code downloads YouTube video and returns raw frames as numpy arrays.
You can pass the file as an argument to the VideoFileClip instead of the URL.
You can use pygame for audio.
You need to initialize pygame.mixer module
And in the loop, add pygame.mixer.music.play()
But for that, you will need to choose audio file as well.
However, I have found better idea! You can use webbrowser module for playing videos (and because it would play on browser, you can hear sounds!)
import webbrowser
webbrowser.open("video.mp4")
import pygame
pygame.mixer.init()
pygame.mixer.music.load(
'c:Your_file')
pygame.mixer.music.play()
while True:
pygame.time.Clock().tick()
#used to show that you can do other stuff while playing audio
print("hi")

Pygame sound file just beeping [duplicate]

I tried pygame for playing wav file like this:
import pygame
pygame.init()
pygame.mixer.music.load("mysound.wav")
pygame.mixer.music.play()
pygame.event.wait()
but It change the voice and I don't know why!
I read this link solutions and can't solve my problem with playing wave file!
for this solution I dont know what should I import?
s = Sound()
s.read('sound.wav')
s.play()
and for this solution /dev/dsp dosen't exist in new version of linux :
from wave import open as waveOpen
from ossaudiodev import open as ossOpen
s = waveOpen('tada.wav','rb')
(nc,sw,fr,nf,comptype, compname) = s.getparams( )
dsp = ossOpen('/dev/dsp','w')
try:
from ossaudiodev import AFMT_S16_NE
except ImportError:
if byteorder == "little":
AFMT_S16_NE = ossaudiodev.AFMT_S16_LE
else:
AFMT_S16_NE = ossaudiodev.AFMT_S16_BE
dsp.setparameters(AFMT_S16_NE, nc, fr)
data = s.readframes(nf)
s.close()
dsp.write(data)
dsp.close()
and when I tried pyglet It give me this error:
import pyglet
music = pyglet.resource.media('mysound.wav')
music.play()
pyglet.app.run()
--------------------------
nima#ca005 Desktop]$ python play.py
Traceback (most recent call last):
File "play.py", line 4, in <module>
music = pyglet.resource.media('mysound.wav')
File "/usr/lib/python2.7/site-packages/pyglet/resource.py", line 587, in media
return media.load(path, streaming=streaming)
File "/usr/lib/python2.7/site-packages/pyglet/media/__init__.py", line 1386, in load
source = _source_class(filename, file)
File "/usr/lib/python2.7/site-packages/pyglet/media/riff.py", line 194, in __init__
format = wave_form.get_format_chunk()
File "/usr/lib/python2.7/site-packages/pyglet/media/riff.py", line 174, in get_format_chunk
for chunk in self.get_chunks():
File "/usr/lib/python2.7/site-packages/pyglet/media/riff.py", line 110, in get_chunks
chunk = cls(self.file, name, length, offset)
File "/usr/lib/python2.7/site-packages/pyglet/media/riff.py", line 155, in __init__
raise RIFFFormatException('Size of format chunk is incorrect.')
pyglet.media.riff.RIFFFormatException: Size of format chunk is incorrect.
AL lib: ReleaseALC: 1 device not closed
You can use PyAudio. An example here on my Linux it works:
#!usr/bin/env python
#coding=utf-8
import pyaudio
import wave
#define stream chunk
chunk = 1024
#open a wav format music
f = wave.open(r"/usr/share/sounds/alsa/Rear_Center.wav","rb")
#instantiate PyAudio
p = pyaudio.PyAudio()
#open stream
stream = p.open(format = p.get_format_from_width(f.getsampwidth()),
channels = f.getnchannels(),
rate = f.getframerate(),
output = True)
#read data
data = f.readframes(chunk)
#play stream
while data:
stream.write(data)
data = f.readframes(chunk)
#stop stream
stream.stop_stream()
stream.close()
#close PyAudio
p.terminate()
Works for me on Windows:
https://pypi.org/project/playsound/
>>> from playsound import playsound
>>> playsound('/path/to/a/sound/file/you/want/to/play.wav')
NOTE: This has a bug in Windows where it doesn't close the stream.
I've added a PR for a fix here:
https://github.com/TaylorSMarks/playsound/pull/53/commits/53240d970aef483b38fc6d364a0ae0ad6f8bf9a0
The reason pygame changes your audio is mixer defaults to a 22k sample rate:
initialize the mixer module
pygame.mixer.init(frequency=22050, size=-16, channels=2, buffer=4096): return None
Your wav is probably 8k. So when pygame plays it, it plays roughly twice as fast. So specify your wav frequency in the init.
Pyglet has some problems correctly reading RIFF headers. If you have a very basic wav file (with exactly a 16 byte fmt block) with no other information in the fmt chunk (like 'fact' data), it works. But it makes no provision for additional data in the chunks, so it's really not adhering to the RIFF interface specification.
PyGame has 2 different modules for playing sound and music, the pygame.mixer module and the pygame.mixer.music module. This module contains classes for loading Sound objects and controlling playback. The difference is explained in the documentation:
The difference between the music playback and regular Sound playback is that the music is streamed, and never actually loaded all at once. The mixer system only supports a single music stream at once.
If you want to play a single wav file, you have to initialize the module and create a pygame.mixer.Sound() object from the file. Invoke play() to start playing the file. Finally, you have to wait for the file to play.
Use get_length() to get the length of the sound in seconds and wait for the sound to finish:
(The argument to pygame.time.wait() is in milliseconds)
import pygame
pygame.mixer.init()
my_sound = pygame.mixer.Sound('mysound.wav')
my_sound.play()
pygame.time.wait(int(my_sound.get_length() * 1000))
Alternatively you can use pygame.mixer.get_busy to test if a sound is being mixed. Query the status of the mixer continuously in a loop:
import pygame
pygame.init()
pygame.mixer.init()
my_sound = pygame.mixer.Sound('mysound.wav')
my_sound.play()
while pygame.mixer.get_busy():
pygame.time.delay(10)
pygame.event.poll()
Windows
winsound
If you are a Windows user,the easiest way is to use winsound.You don't even need to install it.
Not recommended, too few functions
import winsound
winsound.PlaySound("Wet Hands.wav", winsound.SND_FILENAME)
# add winsound.SND_ASYNC flag if you want to wait for it.
# like winsound.PlaySound("Wet Hands.wav", winsound.SND_FILENAME | winsound.SND_ASYNC)
mp3play
If you are looking for more advanced functions, you can try mp3play.
Unluckily,mp3play is only available in Python2 and Windows.
If you want to use it on other platforms,use playsound despite its poor functions.If you want to use it in Python3,I will give you the modified version which is available on Python 3.(at the bottom of the answer)
Also,mp3play is really good at playing wave files, and it gives you more choices.
import time
import mp3play
music = mp3play.load("Wet Hands.wav")
music.play()
time.sleep(music.seconds())
Cross-platform
playsound
Playsound is very easy to use,but it is not recommended because you can't pause or get some infomation of the music, and errors often occurs.Unless other ways doesn't work at all, you may try this.
import playsound
playsound.playsound("Wet Hands.wav", block=True)
pygame
I'm using this code and it works on Ubuntu 22.04 after my test.
If it doesn't work on your machine, consider updating your pygame lib.
import pygame
pygame.mixer.init()
pygame.mixer.music.load("Wet Hands.wav")
pygame.mixer.music.play()
while pygame.mixer.music.get_busy():
pass
pyglet
This works on Windows but it doesn't work on my Ubuntu, so I can do nothing.
import pyglet
import time
sound = pyglet.media.load("Wet Hands.wav", "Wet Hands.wav")
sound.play()
time.sleep(sound.duration)
Conclusion
It seems that you are using Linux,so playsound may be your choice.My code maybe cannot solve your problem by using pygame and pyglet,because I always use Windows.If none of the solutions work on your machine,I suggest you run the program on Windows...
To other users seeing my answer, I have done many tests among many libraries,so if you are using Windows,you may try mp3play which can play both mp3 and wave files, and mp3play is the most pythonic, easy, light-weight and functional library.
mp3play in Python3
just copy the code below and create a file named mp3play.py in your working directory and paste the content.
import random
from ctypes import windll, c_buffer
class _mci:
def __init__(self):
self.w32mci = windll.winmm.mciSendStringA
self.w32mcierror = windll.winmm.mciGetErrorStringA
def send(self, command):
buffer = c_buffer(255)
command = command.encode(encoding="utf-8")
errorcode = self.w32mci(command, buffer, 254, 0)
if errorcode:
return errorcode, self.get_error(errorcode)
else:
return errorcode, buffer.value
def get_error(self, error):
error = int(error)
buffer = c_buffer(255)
self.w32mcierror(error, buffer, 254)
return buffer.value
def directsend(self, txt):
(err, buf) = self.send(txt)
# if err != 0:
# print('Error %s for "%s": %s' % (str(err), txt, buf))
return err, buf
class _AudioClip(object):
def __init__(self, filename):
filename = filename.replace('/', '\\')
self.filename = filename
self._alias = 'mp3_%s' % str(random.random())
self._mci = _mci()
self._mci.directsend(r'open "%s" alias %s' % (filename, self._alias))
self._mci.directsend('set %s time format milliseconds' % self._alias)
err, buf = self._mci.directsend('status %s length' % self._alias)
self._length_ms = int(buf)
def volume(self, level):
"""Sets the volume between 0 and 100."""
self._mci.directsend('setaudio %s volume to %d' %
(self._alias, level * 10))
def play(self, start_ms=None, end_ms=None):
start_ms = 0 if not start_ms else start_ms
end_ms = self.milliseconds() if not end_ms else end_ms
err, buf = self._mci.directsend('play %s from %d to %d'
% (self._alias, start_ms, end_ms))
def isplaying(self):
return self._mode() == 'playing'
def _mode(self):
err, buf = self._mci.directsend('status %s mode' % self._alias)
return buf
def pause(self):
self._mci.directsend('pause %s' % self._alias)
def unpause(self):
self._mci.directsend('resume %s' % self._alias)
def ispaused(self):
return self._mode() == 'paused'
def stop(self):
self._mci.directsend('stop %s' % self._alias)
self._mci.directsend('seek %s to start' % self._alias)
def milliseconds(self):
return self._length_ms
def __del__(self):
self._mci.directsend('close %s' % self._alias)
_PlatformSpecificAudioClip = _AudioClip
class AudioClip(object):
__slots__ = ['_clip']
def __init__(self, filename):
self._clip = _PlatformSpecificAudioClip(filename)
def play(self, start_ms=None, end_ms=None):
if end_ms is not None and end_ms < start_ms:
return
else:
return self._clip.play(start_ms, end_ms)
def volume(self, level):
assert 0 <= level <= 100
return self._clip.volume(level)
def isplaying(self):
return self._clip.isplaying()
def pause(self):
return self._clip.pause()
def unpause(self):
return self._clip.unpause()
def ispaused(self):
return self._clip.ispaused()
def stop(self):
return self._clip.stop()
def seconds(self):
return int(round(float(self.milliseconds()) / 1000))
def milliseconds(self):
return self._clip.milliseconds()
def load(filename):
"""Return an AudioClip for the given filename."""
return AudioClip(filename)

Resources