"""Transcribe screen recordings using Gemini AI."""
import json
import os
import time

import requests
from google import genai
# Complete the values.
# Preferred: export GEMINI_API_KEY in your environment so the key never lives
# in source control. The "..." placeholder is only used when the variable is
# unset or empty; replace it for a quick local run, but never commit a real key.
my_config = {
    "GEMINI_API_KEY": os.environ.get("GEMINI_API_KEY") or "...",
}
print("Environment variables loaded.")

# Shared client used by every Gemini call in this script.
genai_client = genai.Client(api_key=my_config["GEMINI_API_KEY"])

# Local path of the screen recording to transcribe.
recording_file_path = "./data/screen_recording.mp4"
# Extraction prompt sent to the model together with the uploaded video.
# It describes the betting website shown in the recording and asks for a JSON
# list of bets. The closing instruction refers to the <example_object> block
# by its actual tag name so the model is not pointed at an undefined format.
prompt = """You are an AI assistant tasked with extracting information from a video recording, which is provided to you.
The video recording contains web pages from a website that allows users to bet on future events.
Some pages show an overview of multiple bets available in a specific category, and the current odds of each outcome.
Other pages show more detail about a specific bet, its possible outcomes and their odds.
When the bet is just a yes/no question, the odds are displayed in the adjoining text "x% chance" where x is the percentage chance of Yes.
You are interested in collecting the list of all available bets, the category that they belong to, the list of possible outcomes and their current odds.
Make sure that your response includes every single bet shown in the video, whether it is shown on an overview page or on a detail page.
If you can only see partial data regarding a bet's outcomes and current odds, just include the data that you can.
If a bet appears on multiple video frames, you should only include it once.
You must format your response as a list of bets in JSON format. Each bet should be a JSON object in the following format:
<example_object>
{
"name": "Example Bet",
"category": "Politics",
"outcomes": [
{
"name": "Outcome 1",
"probability": 0.5
},
{
"name": "Outcome 2",
"probability": 0.3
},
{
"name": "Outcome 3",
"probability": 0.2
}
# Etc.
]
}
</example_object>
Ensure that your output is a list of objects in the example_object format, with nothing else.
"""
# Utility function
def process_video(
    local_file_path: str,
    prompt: str,
    model: str = "gemini-1.5-pro",
    poll_interval: float = 1.0,
    timeout: "float | None" = None,
):
    """Upload a video and ask the model to answer ``prompt`` about it.

    The file is uploaded through the module-level ``genai_client``, polled
    until its state is no longer ``PROCESSING``, and then passed to
    ``generate_content`` together with the prompt.

    Args:
        local_file_path: Path of the video file to upload.
        prompt: Instruction sent to the model alongside the video.
        model: Name of the model to call. Defaults to the value that was
            previously hard-coded.
        poll_interval: Seconds to sleep between two state checks.
        timeout: Maximum number of seconds to wait for the upload to leave
            the ``PROCESSING`` state. ``None`` (the default) waits forever.

    Returns:
        The ``text`` attribute of the model response.
        NOTE(review): the SDK presumably returns ``None`` here when the
        response has no text part - confirm and handle in callers.

    Raises:
        ValueError: If ``local_file_path`` or ``prompt`` is empty or ``None``,
            or if the uploaded file ends up in the ``FAILED`` state.
        TimeoutError: If ``timeout`` is set and the file is still
            ``PROCESSING`` once it has elapsed.
    """
    # Truthiness checks also reject None, which the "== ''" form let through.
    if not local_file_path:
        raise ValueError("File path cannot be empty.")
    if not prompt:
        raise ValueError("Prompt cannot be empty.")

    print("Processing video", local_file_path)
    print("Uploading file...")
    video_file = genai_client.files.upload(file=local_file_path)
    print(f"Completed upload: {video_file.uri}")

    # Check whether the file is ready to be used.
    deadline = None if timeout is None else time.monotonic() + timeout
    printed_progress = False
    while video_file.state.name == "PROCESSING":
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Timed out after {timeout} seconds waiting for "
                f"{local_file_path!r} to finish processing."
            )
        # Flush so the progress dots show up while we are still waiting.
        print(".", end="", flush=True)
        printed_progress = True
        time.sleep(poll_interval)
        video_file = genai_client.files.get(name=video_file.name)
    if printed_progress:
        # Terminate the line of dots before the next status message.
        print()

    if video_file.state.name == "FAILED":
        raise ValueError(
            f"Processing of {local_file_path!r} failed "
            f"(state: {video_file.state.name})."
        )
    print("The video file is ready to be processed.")

    # Pass the video file reference like any other media part.
    print("Prompt:", prompt)
    genai_response = genai_client.models.generate_content(
        model=model, contents=[video_file, prompt]
    )
    return genai_response.text
def clean_up_output(transcription: str, model: str = "gemini-1.5-pro"):
    """Ask the model to de-duplicate and prune a JSON list of bets.

    The transcription is wrapped in ``<input>`` tags, appended to a fixed
    clean-up prompt, and sent through the module-level ``genai_client``.

    Args:
        transcription: JSON text to clean up; the rest of this file passes
            the output of ``process_video`` here.
        model: Name of the model to call. Defaults to the value that was
            previously hard-coded.

    Returns:
        The ``text`` attribute of the model response.
        NOTE(review): the SDK presumably returns ``None`` here when the
        response has no text part - confirm and handle in callers.

    Raises:
        ValueError: If ``transcription`` is empty or ``None``.
    """
    # Fail early with a clear message, consistent with process_video, instead
    # of a TypeError on string concatenation or a pointless model call.
    if not transcription:
        raise ValueError("Transcription cannot be empty.")

    clean_up_prompt = """You are an AI assistant tasked with cleaning up the output of an tool that extracts information from a video recording.
You are given an input between the <input> tags below, which is a JSON list of bets. Each bet is an object containing the name of the bet (name), the category of the bet (category), and alist of possible outcomes (outcomes).
Each possible outcome is an object containing the name of the outcome (name) and the current odds of the outcome as a number between 0 and 1(probability).
Please perform the following clean up tasks:
* If a single bet is included multiple times, merge the data in the way you see fit.
* If a bet does not have any outcomes, remove it from the list.
Ensure that your output is a list of objects in the same JSON format as the input, with nothing else.
Here is the input:
"""
    genai_response = genai_client.models.generate_content(
        model=model,
        contents=f"{clean_up_prompt}<input>\n{transcription}\n</input>",
    )
    return genai_response.text
# Run the two-stage pipeline:
#   1. have the model extract the raw list of bets from the recording;
#   2. have the model merge duplicates and drop bets without outcomes.
transcription_output1 = process_video(recording_file_path, prompt)
transcription_output2 = clean_up_output(transcription_output1)
# Displaying the final result
# Parse the cleaned-up model output, print it, and save it to disk.
if not transcription_output2:
    raise ValueError("The model returned no text to parse.")

# The model may wrap its JSON in a Markdown code fence (``` or ```json).
# Strip the fence only at the edges of the text, so backticks that occur
# inside legitimate string values are left untouched.
output_clean = transcription_output2.strip()
if output_clean.startswith("```"):
    output_clean = output_clean[3:]
    if output_clean[:4].lower() == "json":
        output_clean = output_clean[4:]
if output_clean.endswith("```"):
    output_clean = output_clean[:-3]
# Flatten newlines to spaces before parsing, as the script has always done.
output_clean = output_clean.replace("\n", " ")

output_dict = json.loads(output_clean)
print(json.dumps(output_dict, indent=2))

formatted_text = json.dumps(output_dict, indent=4)
with open("./data/output.json", "w", encoding="utf-8") as f:
    f.write(formatted_text)