Labels: enhancement (New feature or request)
🚀 Feature
Notes from @tchaton: I wonder if we could do sliced video loading.
It is more like you would do Video(filepath, start_time=..., end_time=...) and we take care of the rest.
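
A minimal sketch of what that user-facing API could look like. The Video class, its fields, and the frames() iterator are all hypothetical here, not an existing API; the sample code below explores how the loading itself could work.

from dataclasses import dataclass
from typing import Iterator, Optional

import numpy as np


@dataclass
class Video:
    """Hypothetical handle for a time slice of a (possibly remote) video."""
    filepath: str                      # local path or cloud URI, e.g. "gs://bucket/clip.mp4"
    start_time: float = 0.0            # slice start, in seconds
    end_time: Optional[float] = None   # slice end, in seconds; None means "until the end"

    def frames(self) -> Iterator[np.ndarray]:
        # A real loader would resolve [start_time, end_time] to byte ranges
        # and decode only that slice -- see the sample code below.
        raise NotImplementedError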
Sample code
import sys

try:
    import objstore
    import ffmpeg
    from pymp4.parser import Box
except ImportError:
    print("Missing required libraries. Please run:", file=sys.stderr)
    print("pip install objstore 'objstore[gcs]' pymp4 ffmpeg-python", file=sys.stderr)
    sys.exit(1)


def get_gcs_video_slice_with_objstore(bucket_name, video_path, output_path, start_time, end_time):
    """
    Efficiently slices a video from GCS using objstore and pymp4
    by downloading only the necessary byte ranges.

    Args:
        bucket_name (str): The name of the GCS bucket.
        video_path (str): The path to the video file in the bucket.
        output_path (str): The local path to save the sliced video.
        start_time (float): The start time of the slice in seconds.
        end_time (float): The end time of the slice in seconds.
    """
    try:
        # 1. Connect to GCS with objstore
        print(f"Connecting to GCS bucket: {bucket_name}")
        store = objstore.ObjStore(f"gs://{bucket_name}")
        meta = store.get_meta(video_path)
        file_size = meta.size
        print(f"Successfully connected. Video size: {file_size} bytes.")

        # 2. Locate 'ftyp', 'moov', and 'mdat' by walking the top-level boxes.
        #    Every MP4 box starts with a 4-byte big-endian size and a 4-byte
        #    type, so locating them only costs 8 bytes of download per box.
        #    (Assumes 32-bit box sizes; 64-bit 'largesize' boxes are not handled.)
        print("Scanning top-level MP4 boxes...")
        boxes = {}
        offset = 0
        while offset < file_size:
            header = store.get_range(video_path, offset, offset + 8)
            box_size = int.from_bytes(header[:4], 'big')
            box_type = header[4:8].decode('ascii', errors='replace')
            boxes[box_type] = (offset, box_size)
            if box_size < 8:  # 0 means "extends to end of file"; anything else < 8 is corrupt
                break
            offset += box_size
        if 'ftyp' not in boxes or 'moov' not in boxes or 'mdat' not in boxes:
            raise ValueError("Could not locate 'ftyp'/'moov'/'mdat' atoms; is this a valid MP4?")

        # 3. Fetch the 'ftyp' atom (file type) and the 'moov' atom (metadata)
        ftyp_offset, ftyp_size = boxes['ftyp']
        ftyp_data = store.get_range(video_path, ftyp_offset, ftyp_offset + ftyp_size)
        print(f"'ftyp' atom has size {ftyp_size}.")
        moov_offset, moov_size = boxes['moov']
        moov_data = store.get_range(video_path, moov_offset, moov_offset + moov_size)
        print(f"'moov' atom found at offset {moov_offset} with size {moov_size}.")

        # 4. Parse the moov atom to find media chunks
        print("Parsing metadata with pymp4...")
        Box.parse(moov_data)

        # 5. Download the actual video data chunk ('mdat')
        # NOTE: A fully optimized implementation would parse the 'stco', 'stsz',
        # etc. tables from the 'moov' atom to find the *exact* byte ranges for
        # the desired time slice (see the sketch after this example). It would
        # also need to rewrite the 'stco' chunk offsets if boxes are reordered.
        # For simplicity, this example downloads the whole mdat chunk.
        mdat_offset, mdat_size = boxes['mdat']
        print(f"Downloading video data (mdat) starting from offset {mdat_offset}...")
        video_data_chunk = store.get_range(video_path, mdat_offset, mdat_offset + mdat_size)
        print("Video data downloaded.")

        # 6. Reconstruct an in-memory MP4 and pipe it to FFmpeg
        reconstructed_mp4 = ftyp_data + moov_data + video_data_chunk
        print(f"Slicing video from {start_time}s to {end_time}s with FFmpeg...")
        input_stream = ffmpeg.input('pipe:', format='mp4')
        output_stream = ffmpeg.output(
            input_stream,
            output_path,
            ss=start_time,
            to=end_time,
            c='copy',  # Use stream copy for speed, as we are not re-encoding
        ).overwrite_output()
        # communicate() avoids the deadlock that stdin.write() can hit when
        # FFmpeg's stderr pipe fills up before we finish writing.
        process = ffmpeg.run_async(output_stream, pipe_stdin=True, pipe_stderr=True)
        _, stderr = process.communicate(input=reconstructed_mp4)
        if process.returncode != 0:
            raise ffmpeg.Error('ffmpeg', None, stderr)
        print(f"Successfully sliced video and saved to {output_path}")
    except objstore.gcs.GCSObjectNotFound:
        print(f"Error: The video '{video_path}' was not found in bucket '{bucket_name}'.", file=sys.stderr)
        sys.exit(1)
    except ffmpeg.Error as e:
        print("An FFmpeg error occurred:", file=sys.stderr)
        print(e.stderr.decode(), file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"An unexpected error occurred: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    # --- Configuration ---
    # Using a public GCS bucket with a sample video for demonstration
    BUCKET_NAME = "gcs-samples-bucket"
    VIDEO_PATH = "video/big_buck_bunny.mp4"
    OUTPUT_PATH = "sliced_video_objstore.mp4"
    START_TIME = 10  # seconds
    END_TIME = 20  # seconds
    # ---------------------
    get_gcs_video_slice_with_objstore(BUCKET_NAME, VIDEO_PATH, OUTPUT_PATH, START_TIME, END_TIME)
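
As the NOTE in step 5 says, the fully optimized version would map the requested time range to exact byte ranges using the sample tables inside 'moov' ('stts' for timing, 'stsz' for sample sizes, 'stco' for chunk offsets), and would snap the start back to the previous keyframe via 'stss'. Below is a rough sketch of only the core arithmetic, assuming the tables have already been extracted into plain Python lists, one sample per chunk, and a constant sample duration; real files often violate both assumptions.

def time_slice_to_byte_ranges(start_time, end_time, timescale,
                              sample_duration, sample_sizes, chunk_offsets):
    """Return (offset, size) pairs for the samples overlapping [start_time, end_time].

    Simplifying assumptions: one sample per chunk ('stsc' ignored) and a
    constant per-sample duration ('stts' collapsed to a single value). A real
    implementation would also rewind `first` to the previous keyframe ('stss')
    so the slice starts on a decodable frame.
    """
    first = int(start_time * timescale) // sample_duration
    last = int(end_time * timescale) // sample_duration
    return [
        (chunk_offsets[i], sample_sizes[i])
        for i in range(first, min(last + 1, len(sample_sizes)))
    ]

# Example: a 30 fps track with timescale 15360 has sample_duration 512,
# so a 10s-20s slice covers samples 300..600.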
Bookmarking for video streaming:

sudo apt-get install -y \
python3-gi \
python3-gst-1.0 \
gir1.2-gstreamer-1.0 \
gir1.2-gst-plugins-base-1.0 \
gstreamer1.0-tools \
gstreamer1.0-plugins-base \
gstreamer1.0-plugins-good \
gstreamer1.0-plugins-bad \
    gstreamer1.0-libav
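
One caveat before running the streaming sample: the s3src element it uses is shipped by the Rust GStreamer plugins (gst-plugins-rs), not by the apt packages above, and depending on the plugin version the element may be registered as s3src or awss3src. A quick availability check:

import gi
gi.require_version('Gst', '1.0')
from gi.repository import Gst

Gst.init(None)
for name in ("s3src", "awss3src"):
    status = "available" if Gst.ElementFactory.find(name) else "not found"
    print(f"{name}: {status}")

The streaming sample itself: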
import gi
import sys
import numpy as np
import asyncio
from concurrent.futures import ThreadPoolExecutor
from typing import List

gi.require_version('Gst', '1.0')
from gi.repository import Gst, GLib


class AsyncVideoSlicer:
    """
    An asynchronous class to handle video slicing from S3 using GStreamer.
    It yields raw video frames as NumPy arrays from a specified time slice.
    This version is designed to work with asyncio.
    """

    def __init__(self, executor: ThreadPoolExecutor):
        Gst.init(None)
        self.executor = executor
        # These are set for each call to get_slice()
        self.pipeline = None
        self.main_loop = None
        self.future_result = None
        self.event_loop = None
        self.frames: List[np.ndarray] = []

    def _on_new_sample(self, sink) -> Gst.FlowReturn:
        # This callback runs in the GLib thread
        sample = sink.pull_sample()
        if not sample:
            return Gst.FlowReturn.ERROR
        buf = sample.get_buffer()
        caps = sample.get_caps()
        height = caps.get_structure(0).get_value("height")
        width = caps.get_structure(0).get_value("width")
        frame_data = buf.extract_dup(0, buf.get_size())
        frame = np.ndarray((height, width, 3), buffer=frame_data, dtype=np.uint8)
        # Collect frames on the instance; the list is handed to the asyncio
        # future once the bus reports EOS or an error.
        self.frames.append(frame)
        return Gst.FlowReturn.OK

    def _set_future_result(self, result):
        """Thread-safe way to set the result on the asyncio future."""
        if self.future_result and not self.future_result.done():
            self.event_loop.call_soon_threadsafe(self.future_result.set_result, result)

    def _on_bus_message(self, bus, message):
        """Bus message handler, runs in the GLib thread."""
        msg_type = message.type
        if msg_type == Gst.MessageType.EOS or msg_type == Gst.MessageType.ERROR:
            if msg_type == Gst.MessageType.ERROR:
                err, debug = message.parse_error()
                print(f"Error in GStreamer: {err.message} ({debug})")
            # Hand the collected frames back to the asyncio side
            self._set_future_result(self.frames)
            self.main_loop.quit()

    def _run_gstreamer_loop(self):
        """This is the blocking function that will run in a separate thread."""
        try:
            self.main_loop.run()
        except Exception as e:
            print(f"Exception in GStreamer thread: {e}")
            self._set_future_result([])  # Return an empty list on error
        finally:
            self.pipeline.set_state(Gst.State.NULL)

    async def get_slice(self, bucket_name: str, key: str, start_seconds: int, duration_seconds: int) -> List[np.ndarray]:
        """
        Asynchronously builds and runs the pipeline to retrieve a slice of video frames.
        """
        self.event_loop = asyncio.get_running_loop()
        self.future_result = self.event_loop.create_future()
        self.frames = []
        self.main_loop = GLib.MainLoop()

        pipeline_str = (
            f"s3src bucket={bucket_name} key={key} ! decodebin name=dec "
            f"dec. ! queue ! videoconvert ! video/x-raw,format=RGB ! appsink name=videosink emit-signals=true sync=false "
            f"dec. ! queue ! fakesink"
        )
        self.pipeline = Gst.parse_launch(pipeline_str)
        appsink = self.pipeline.get_by_name("videosink")
        appsink.connect("new-sample", self._on_new_sample)
        bus = self.pipeline.get_bus()
        bus.add_signal_watch()
        bus.connect("message", self._on_bus_message)

        # Start the pipeline and seek. A full seek (rather than seek_simple)
        # lets us set a stop position, so the pipeline emits EOS after
        # duration_seconds instead of playing to the end of the file.
        self.pipeline.set_state(Gst.State.PLAYING)
        self.pipeline.seek(
            1.0, Gst.Format.TIME,
            Gst.SeekFlags.FLUSH | Gst.SeekFlags.ACCURATE,
            Gst.SeekType.SET, start_seconds * Gst.SECOND,
            Gst.SeekType.SET, (start_seconds + duration_seconds) * Gst.SECOND,
        )
        print(f"Slicing {key} from {start_seconds}s for {duration_seconds}s...")

        # Schedule a timeout to kill the pipeline if it gets stuck
        loop_killer_task = self.event_loop.create_task(
            self._timeout_killer(duration_seconds + 5)  # 5s buffer
        )
        # Run the blocking GLib main loop in the thread pool
        # and wait for it to complete.
        await self.event_loop.run_in_executor(
            self.executor, self._run_gstreamer_loop
        )
        # Clean up the timeout task
        loop_killer_task.cancel()
        # The future is set by the bus message handler when EOS/Error occurs.
        return await self.future_result

    async def _timeout_killer(self, timeout_seconds: int):
        """Async task to forcefully stop the GLib loop if it times out."""
        await asyncio.sleep(timeout_seconds)
        if self.main_loop.is_running():
            print("Timeout reached, forcefully stopping GStreamer loop.")
            # Hand back whatever frames we collected before the timeout
            self._set_future_result(self.frames)
            self.main_loop.quit()


# --- HOW TO RUN ---
async def main():
    S3_BUCKET = "your-s3-bucket-name"  # <--- CHANGE THIS
    VIDEO_KEYS = [
        "source-videos/my-awesome-video.mp4",
        "source-videos/another-video.mp4",
    ]
    # Use a ThreadPoolExecutor to run the blocking GLib loops
    with ThreadPoolExecutor() as executor:
        # One slicer per video: each instance owns its own pipeline, main loop,
        # and frame list, so concurrent slices don't clobber each other's state.
        slicers = [AsyncVideoSlicer(executor) for _ in VIDEO_KEYS]
        # Create concurrent tasks to slice both videos at the same time
        tasks = [
            slicer.get_slice(S3_BUCKET, key, start_seconds=30, duration_seconds=2)
            for slicer, key in zip(slicers, VIDEO_KEYS)
        ]
        # Wait for all tasks to complete
        results = await asyncio.gather(*tasks)
        for key, frames in zip(VIDEO_KEYS, results):
            if frames:
                print(f"Result for {key}: Got {len(frames)} frames. Shape of first frame: {frames[0].shape}")
            else:
                print(f"Result for {key}: No frames returned.")


if __name__ == '__main__':
    # This runs the main async function
    asyncio.run(main())
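
Tying the two sketches together: the hypothetical Video handle proposed at the top of this issue could be backed by AsyncVideoSlicer for S3 sources. Purely illustrative glue, not an existing API:

import asyncio
from concurrent.futures import ThreadPoolExecutor


def load_slice(bucket: str, key: str, start: int, duration: int):
    """Synchronous convenience wrapper around AsyncVideoSlicer (illustrative)."""
    with ThreadPoolExecutor() as executor:
        slicer = AsyncVideoSlicer(executor)
        return asyncio.run(
            slicer.get_slice(bucket, key, start_seconds=start, duration_seconds=duration)
        )

# frames = load_slice("my-bucket", "clips/train_0001.mp4", start=30, duration=2)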
Motivation
Pitch
Alternatives
Additional context