7 votes

I'm trying to use my Python app to transcribe multiple files in a folder and speed up the process. At present I am able to do it one file at a time:

####RUN THIS PART FIRST#########
import json
from os.path import join, dirname
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
import threading
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
import pandas as pd
authenticator = IAMAuthenticator('xxyyzz')

service = SpeechToTextV1(authenticator=authenticator)
service.set_service_url('https://api.us-east.speech-to-text.watson.cloud.ibm.com')

models = service.list_models().get_result()
#print(json.dumps(models, indent=2))

model = service.get_model('en-US_BroadbandModel').get_result()
#print(json.dumps(model, indent=2))

# This is the name of the file you need to change below
with open(join(dirname('__file__'), 'Call 8.wav'), 'rb') as audio_file:
    # get_result() returns a dict with 'results' and 'speaker_labels'
    output = service.recognize(
        audio=audio_file,
        speaker_labels=True,
        content_type='audio/wav',
        #timestamps=True,
        #word_confidence=True,
        inactivity_timeout=-1,
        model='en-US_NarrowbandModel',
        continuous=True).get_result()
############END################################

# get data to a csv
########################RUN THIS PART SECOND#####################################
df0 = pd.DataFrame([alt for res in output['results'] for alt in res['alternatives']])

df1 = pd.DataFrame(output['speaker_labels'])

# Inspect the columns if needed:
#list(df0.columns)
#list(df1.columns)

df0 = df0.drop(["timestamps"], axis=1)
df1 = df1.drop(["final", "confidence"], axis=1)
test3 = pd.concat([df0, df1], axis=1)
#sentiment
transcript = test3['transcript']
transcript = transcript.dropna()
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
text = transcript
scores = []
for txt in text:
    vs = analyzer.polarity_scores(txt)
    scores.append(vs)
data = pd.DataFrame(text, columns= ['Text'])
data2 = pd.DataFrame(scores)
final_dataset= pd.concat([data,data2], axis=1)
test4 = pd.concat([test3,final_dataset], axis=1)
test4 = test4.drop(['Text'],axis=1)
test4.rename(columns={'neg': 'Negative', 'pos': 'Positive', 'neu': 'Neutral'},
             inplace=True)

# This is the name of the output csv file
test4.to_csv("Call 8.csv")

How can I change this so that it transcribes multiple files in a folder instead of one file at a time? I can run this script multiple times, but I want to automate it so that it picks up the .wav files from a folder and processes each one. Let's say I have 15 .wav audio files in my folder C:\Python. I want an automated process where the script runs and produces 15 CSVs, one for each file with its respective output. Right now the script works, but I have to run it manually for each .wav file to get that file's output CSV.

Also, as a second question (sorry!), is there a way to speed up the transcription? I tried breaking the .wav files into smaller segments and sending them to Watson, but it didn't work. My reference was https://github.com/freelanceastro/interview-transcriber

Add what you have tried, e.g. have you sent multiple requests in parallel? If yes, what is the error you are seeing? – data_henrik
I haven't sent them in parallel. How can I do it for multiple files in a folder? – user12384956
Python has some multi-processing / async-processing modules and features. – data_henrik

4 Answers

0 votes

Have you tried running this script multiple times? You could write a wrapper that launches the script in subprocesses, something like this:

import subprocess
import sys

processes = []
for _ in range(5):
    processes.append(subprocess.Popen([sys.executable, "/path/to/script.py"]))

# now wait for them to finish
for process in processes:
    process.wait()
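
For each subprocess to work on a different file, the script itself would also need to take the filename on the command line (e.g. read it with sys.argv[1] instead of hard-coding 'Call 8.wav'). A minimal sketch of the wrapper under that assumption:

import glob
import subprocess
import sys

# Launch one copy of the (modified) script per .wav file in the folder.
processes = []
for wav in glob.glob("*.wav"):
    processes.append(subprocess.Popen([sys.executable, "/path/to/script.py", wav]))

# Wait for all of them to finish.
for process in processes:
    process.wait()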

0 votes

Seems like you want to find all .wav files in a directory and process each in turn.

import os

for filename in os.listdir(os.getcwd()):
    if filename.endswith('.wav'):
        with open(filename, 'rb') as audio_file:
            # call service.recognize() and build the CSV here,
            # exactly as in your single-file script
            ...


You could even extend it so that it keeps running and only processes new files.
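
A minimal sketch of that idea, assuming a simple polling loop and a hypothetical transcribe_to_csv() helper that wraps your existing single-file code:

import os
import time

seen = set()
while True:
    for filename in os.listdir(os.getcwd()):
        if filename.endswith('.wav') and filename not in seen:
            seen.add(filename)
            # transcribe_to_csv() is assumed to wrap your existing
            # recognize-and-write-CSV code for a single file.
            transcribe_to_csv(filename)
    time.sleep(10)  # check the folder again every 10 seconds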

0 votes

You can turn your code into a function, scan for all files with the .wav extension in your current directory (using os as mentioned previously, or glob), and call that function for every file. It would result in something like this:

####RUN THIS PART FIRST#########
import json
from os.path import join, dirname
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
import threading
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
import pandas as pd
import glob

authenticator = IAMAuthenticator('xxyyzz')

service = SpeechToTextV1(authenticator=authenticator)
service.set_service_url('https://api.us-east.speech-to-text.watson.cloud.ibm.com')

models = service.list_models().get_result()
#print(json.dumps(models, indent=2))

model = service.get_model('en-US_BroadbandModel').get_result()
#print(json.dumps(model, indent=2))

def transcribe(infile, service):
    with open(infile, 'rb') as audio_file:
        # get_result() returns a dict with 'results' and 'speaker_labels'
        output = service.recognize(
            audio=audio_file,
            speaker_labels=True,
            content_type='audio/wav',
            #timestamps=True,
            #word_confidence=True,
            inactivity_timeout=-1,
            model='en-US_NarrowbandModel',
            continuous=True).get_result()

    # get data to a csv
    df0 = pd.DataFrame([alt for res in output['results'] for alt in res['alternatives']])

    df1 = pd.DataFrame(output['speaker_labels'])

    df0 = df0.drop(["timestamps"], axis=1)
    df1 = df1.drop(["final", "confidence"], axis=1)
    test3 = pd.concat([df0, df1], axis=1)
    #sentiment
    transcript = test3['transcript']
    transcript = transcript.dropna()
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()
    text = transcript
    scores = []
    for txt in text:
        vs = analyzer.polarity_scores(txt)
        scores.append(vs)
    data = pd.DataFrame(text, columns= ['Text'])
    data2 = pd.DataFrame(scores)
    final_dataset= pd.concat([data,data2], axis=1)
    test4 = pd.concat([test3,final_dataset], axis=1)
    test4 = test4.drop(['Text'],axis=1)
    test4.rename(columns={'neg': 'Negative', 'pos': 'Positive', 'neu': 'Neutral'},
                 inplace=True)

    # This is the name of the output csv file
    test4.to_csv(infile[:-4] + ".csv")

for i in glob.glob("*.wav"):
    transcribe(i, service)
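
If the files live in a specific folder rather than the working directory (for example C:\Python from the question), you can point glob at that folder instead; a small variation:

# Process every .wav file in a given folder (path is an example).
for path in glob.glob(r"C:\Python\*.wav"):
    transcribe(path, service)

The output CSV is written next to each .wav file, since transcribe() builds its name from infile.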

0 votes

I think I might have something:

import os
import json
import time
# import threading
from pathlib import Path

import concurrent.futures

# from os.path import join, dirname
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import pandas as pd

# Replace with your api key.
my_api_key = "abc123"

# You can add a directory path to Path() if you want to run
# the project from a different folder at some point.
directory = Path().absolute()


authenticator = IAMAuthenticator(my_api_key)

service = SpeechToTextV1(authenticator=authenticator)
service.set_service_url('https://api.us-east.speech-to-text.watson.cloud.ibm.com')
# I used this URL.
# service.set_service_url('https://stream.watsonplatform.net/speech-to-text/api') 


models = service.list_models().get_result()
#print(json.dumps(models, indent=2))

model = service.get_model('en-US_BroadbandModel').get_result()
#print(json.dumps(model, indent=2))



# get data to a csv
########################RUN THIS PART SECOND#####################################


def process_data(json_data, output_path):

    print(f"Processing: {output_path.stem}")

    cols = ["transcript", "confidence"]

    dfdata = [[t[cols[0]], t[cols[1]]] for r in json_data.get('results') for t in r.get("alternatives")]

    df0 = pd.DataFrame(data = dfdata, columns = cols)

    df1 = pd.DataFrame(json_data.get("speaker_labels")).drop(["final", "confidence"], axis=1)


    # test3 = pd.concat([df0, df1], axis=1)
    test3 = pd.merge(df0, df1, left_index = True, right_index = True)


    # sentiment
    print(f"Getting sentiment for: {output_path.stem}")
    transcript = test3["transcript"].dropna()

    analyzer = SentimentIntensityAnalyzer()
    text = transcript
    scores = [analyzer.polarity_scores(txt) for txt in text]

    # data = pd.DataFrame(text, columns = ["Text"])
    data = transcript.to_frame(name="Text")
    data2 = pd.DataFrame(scores)


    # final_dataset= pd.concat([data, data2], axis=1)
    final_dataset = pd.merge(data, data2, left_index = True, right_index = True)

    # test4 = pd.concat([test3, final_dataset], axis=1)
    test4 = pd.merge(test3, final_dataset, left_index = True, right_index = True)

    test4.drop("Text", axis=1, inplace=True)

    test4.rename(columns = {
            "neg": "Negative",
            "pos": "Positive",
            "neu": "Neutral",
            }, inplace=True)

    # This is the name of the output csv file
    test4.to_csv(output_path, index = False)


def process_audio_file(filename, output_type = "csv"):

    audio_file_path = directory.joinpath(filename)

    # Update output path to consider `output_type` parameter.
    out_path = directory.joinpath(f"{audio_file_path.stem}.{output_type}")

    print(f"Current file: '{filename}'")

    with open(audio_file_path, "rb") as audio_file:
        data = service.recognize(
                audio = audio_file,
                speaker_labels = True,
                content_type = "audio/wav",
                inactivity_timeout = -1,
                model = "en-US_NarrowbandModel",
                continuous = True,
            ).get_result()

    print(f"Speech-to-text complete for: '{filename}'")

    # Return data and output path as collection.
    return [data, out_path]


def main():
    print("Running main()...")

    # Default num. workers == min(32, os.cpu_count() + 4)
    n_workers = os.cpu_count() + 2

    # Create generator for all .wav files in folder (and subfolders).
    file_gen = directory.glob("**/*.wav")

    with concurrent.futures.ThreadPoolExecutor(max_workers = n_workers) as executor:
        futures = {executor.submit(process_audio_file, f) for f in file_gen}
        for future in concurrent.futures.as_completed(futures):
            pkg = future.result()
            process_data(*pkg)


if __name__ == "__main__":

    print(f"Program to process audio files has started.")

    t_start = time.perf_counter()

    main()

    t_stop = time.perf_counter()
    print(f"Done! Processing completed in {t_stop - t_start} seconds.")