# Secrets consumed further down: the service-account key is handed to
# google-auth and the Gemini API key to the google-genai client.
my_config = {
    "SERVICE_ACCOUNT_CREDENTIALS": {
        "type": "service_account",
        # ... (remaining service-account key fields elided)
    },
    "GEMINI_API_KEY": "",
}

# Name of the Google Cloud Storage bucket the recording is uploaded to.
bucket_name = ""

# Local recording to upload: place it in the notebook's folder under the
# "meeting_recording.mp3" name.
source_file_name = "meeting_recording.mp3"

# The object keeps the same name in the bucket as the local file.
destination_blob_name = source_file_name
from google.oauth2 import service_account # google-auth
from google.cloud import storage # google-cloud-storage
# Build credentials from the service-account key stored in my_config; the same
# credentials object is reused for the Speech client below.
service_account_info = my_config["SERVICE_ACCOUNT_CREDENTIALS"]
credentials = service_account.Credentials.from_service_account_info(
    service_account_info
)

# Upload the local recording to the configured bucket.
storage_client = storage.Client(credentials=credentials)
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)
blob.upload_from_filename(source_file_name)

# gs:// URI of the uploaded object, passed to Speech-to-Text later on.
gcs_uri = f"gs://{bucket_name}/{destination_blob_name}"
print(f"File {source_file_name} uploaded to {gcs_uri}.")
# We use Speech to Text V1, which supports identification of speakers
from google.cloud import speech # google-cloud-speech
speechclient = speech.SpeechClient(credentials=credentials)

# Transcribe the speech recording stored in Google Storage.
# Reference: https://cloud.google.com/speech-to-text/docs/samples/speech-transcribe-diarization-gcs-beta?hl=en#speech_transcribe_diarization_gcs_beta-python

# Remote location of the audio file (the object uploaded above).
audio = speech.RecognitionAudio(uri=gcs_uri)

# Speaker diarization: tag each word with a speaker, allowing 1 to 10 speakers.
diarization_config = speech.SpeakerDiarizationConfig(
    enable_speaker_diarization=True,
    min_speaker_count=1,
    max_speaker_count=10,
)

# Reference regarding audio encoding:
# https://cloud.google.com/speech-to-text/docs/encoding
audio_encoding = speech.RecognitionConfig.AudioEncoding.MP3

recognition_config = speech.RecognitionConfig(
    encoding=audio_encoding,
    # You can find the sample rate in the file metadata on your laptop.
    sample_rate_hertz=44100,
    # Examples of language codes:
    # https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages
    language_code="en-US",
    diarization_config=diarization_config,
    enable_automatic_punctuation=True,
)

print("Transcribing, this will take at least 15 minutes...")

# Start the recognition as a long-running operation, then wait for its result
# with a 30-minute timeout (expressed in seconds).
operation = speechclient.long_running_recognize(
    config=recognition_config,
    audio=audio,
)
response = operation.result(timeout=1800)
# Build a speaker-labelled transcript from the recognition response.
#
# The transcript within each result is separate and sequential per result.
# However, the words list within an alternative includes all the words
# from all the results thus far. Thus, to get all the words with speaker
# tags, you only have to take the words list from the last result.
#
# Output format: every change of speaker starts a new paragraph of the form
# "\n\n<SpeakerN> ", and each word is followed by a single space.
# `transcript` stays empty when the response carries no results.
transcript = ""
if response and response.results:
    result = response.results[-1]
    words_info = result.alternatives[0].words

    # Collect the pieces in a list and join once, instead of growing the
    # string with repeated "+=".
    pieces = []
    # None cannot equal a real speaker tag, so the first word always opens
    # a new speaker paragraph.
    current_speaker = None
    for word_info in words_info:
        if word_info.speaker_tag != current_speaker:
            current_speaker = word_info.speaker_tag
            pieces.append(f"\n\n<Speaker{current_speaker}> ")
        pieces.append(f"{word_info.word} ")
    transcript = "".join(pieces)

print(transcript)
# Summarize
# https://ai.google.dev/gemini-api/docs
from google import genai # google-genai
# Ask Gemini for a Markdown summary of the transcript.
llm_client = genai.Client(api_key=my_config["GEMINI_API_KEY"])

# The prompt embeds the transcript between <text> tags and shows the expected
# output between <example> tags. The sub-bullets of the example are indented
# with 4 spaces so that the example agrees with the formatting rule stated in
# the prompt itself ("indented with 4 spaces followed by a * character").
prompt = f"""Summarize the following text provided between the <text> tags, by generating a Markdown summary in the format provided between the <example> tags.
The Markdown summary consists of two bullet points, 'summary' and 'next steps', each with less than 10 sub-bullet points.
Under summary, please make sure to list the main agreements and decisions reached.
Under next steps, please make sure the list the agreed actions and next steps.
Bullet points must start with a star character.
Sub bullet points must be indented with 4 spaces followed by a * character.
Please do not include any other text in your response, other than the list of bullet points.
<text>
{transcript}
</text>
Here is an illustrative example of the output:
<example>
* Summary
    * The meeting participants agreed to pursue business relationships
* Next steps
    * The meeting participants agreed to meet again in two weeks.
    * They will revert back with names of potential team members within 1 week.
</example>
"""

# NOTE: this rebinds `response`, replacing the Speech-to-Text response above.
response = llm_client.models.generate_content(
    model="gemini-2.0-flash",
    contents=prompt,
)
meeting_summary = response.text
print(meeting_summary)