
Commit

Merge pull request #42 from TIGER-AI-Lab/arena
Adapting arena build into main build
vinesmsuic authored Aug 18, 2024
2 parents 5c78a10 + 46b25ed commit 4e28b21
Showing 97 changed files with 86 additions and 10 deletions.
2 changes: 1 addition & 1 deletion src/videogen_hub/_version.py
@@ -1 +1 @@
__version__ = "0.1.2"
__version__ = "0.1.4a0"
1 change: 0 additions & 1 deletion src/videogen_hub/benchmark/text_guided_t2v.py
@@ -70,7 +70,6 @@ def infer_text_guided_vg_bench(
if overwrite_model_outputs or not os.path.exists(dest_file):
print("========> Inferencing", dest_file)
frames = model.infer_one_video(prompt=prompt["prompt_en"])
print("======> frames.shape", frames.shape)

#special_treated_list = ["LaVie", "ModelScope", "T2VTurbo"]
special_treated_list = []
1 change: 1 addition & 0 deletions src/videogen_hub/infermodels/__init__.py
@@ -9,6 +9,7 @@
from .opensora_plan import OpenSoraPlan
from .t2v_turbo import T2VTurbo
from .opensora_12 import OpenSora12
+from .cogvideox import CogVideoX

# from .cogvideo import CogVideo # Not supporting CogVideo ATM

48 changes: 48 additions & 0 deletions src/videogen_hub/infermodels/cogvideox.py
@@ -0,0 +1,48 @@
import torch

class CogVideoX:
    def __init__(self, weight="THUDM/CogVideoX-2b", device="cuda"):
        """
        Initializes the CogVideoX pipeline with the given weights and device.
        Args:
            weight (str, optional): Hugging Face repo id of the pretrained weights. Defaults to "THUDM/CogVideoX-2b".
            device (str, optional): The device to run the model on. Defaults to "cuda".
        """
        from diffusers import CogVideoXPipeline

        self.pipe = CogVideoXPipeline.from_pretrained(weight).to(device)

    def infer_one_video(
        self,
        prompt: str = None,
        size: list = [320, 512],
        seconds: int = 2,
        fps: int = 8,
        seed: int = 42,
    ):
        """
        Generates a single video based on the provided prompt and parameters.
        Args:
            prompt (str, optional): The text prompt to generate the video from. Defaults to None.
            size (list, optional): The size of the video as [height, width]. Defaults to [320, 512].
                Currently unused: height and width are left at the pipeline defaults.
            seconds (int, optional): The duration of the video in seconds. Defaults to 2.
            fps (int, optional): The frames per second of the video. Defaults to 8.
            seed (int, optional): The seed for random number generation. Defaults to 42.
        Returns:
            torch.Tensor: The generated video as a tensor of shape (T, C, H, W).
        """

        video = self.pipe(prompt=prompt,
                          guidance_scale=6,
                          num_frames=seconds * fps,
                          # height=size[0],
                          # width=size[1],
                          num_inference_steps=50,
                          generator=torch.manual_seed(seed)).frames[0]
        from videogen_hub.utils import images_to_tensor
        video = video[:-1]  # drop the last frame
        video = images_to_tensor(video)  # convert the PIL frames back to a tensor (T, C, H, W)

        return video
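
For reference, a minimal usage sketch of the new wrapper as exposed through videogen_hub.infermodels (requires a CUDA device; the prompt and the printed-shape comment are illustrative, not part of the commit):

from videogen_hub.infermodels import CogVideoX

model = CogVideoX()  # downloads THUDM/CogVideoX-2b on first use
video = model.infer_one_video(
    prompt="a corgi surfing a small wave at sunset",
    seconds=2,
    fps=8,
    seed=42,
)
print(video.shape)  # torch.Tensor of shape (T, C, H, W) with values in [0, 1]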
4 changes: 2 additions & 2 deletions src/videogen_hub/infermodels/lavie.py
@@ -26,9 +26,9 @@ def __init__(self, model_path=os.path.join(MODEL_PATH, "lavie"), device="cuda"):
from omegaconf import OmegaConf

snapshot_download(repo_id="Vchitect/LaVie", local_dir=model_path)
snapshot_download(repo_id="CompVis/stable-diffusion-v1-4", local_dir=os.path.join(model_path, "/stable-diffusion-v1-4"))
snapshot_download(repo_id="CompVis/stable-diffusion-v1-4", local_dir=os.path.join(model_path, "stable-diffusion-v1-4"))
snapshot_download(repo_id="stabilityai/stable-diffusion-x4-upscaler",
local_dir=os.path.join(model_path, "/stable-diffusion-x4-upscaler"))
local_dir=os.path.join(model_path, "stable-diffusion-x4-upscaler"))

torch.set_grad_enabled(False)
self.device = device
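
The leading slashes removed above matter because os.path.join discards every earlier component once it encounters an absolute path, so the old calls would have downloaded into the filesystem root instead of under model_path. A quick illustration with a hypothetical model_path:

import os

os.path.join("weights/lavie", "/stable-diffusion-v1-4")   # -> "/stable-diffusion-v1-4" (model_path discarded)
os.path.join("weights/lavie", "stable-diffusion-v1-4")    # -> "weights/lavie/stable-diffusion-v1-4"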
4 changes: 2 additions & 2 deletions src/videogen_hub/infermodels/opensora_12.py
@@ -41,7 +41,7 @@ def __init__(self, device="gpu"):
"vae": {
"type": "OpenSoraVAE_V1_2", # Type of the autoencoder
"from_pretrained": "hpcai-tech/OpenSora-VAE-v1.2", # Pretrained model from Hugging Face
"cache_dir": os.path.join(MODEL_PATH, "OpenSora-VAE-v1.2"), # Local cache directory for model weights
#"cache_dir": os.path.join(MODEL_PATH, "OpenSora-VAE-v1.2"), # Local cache directory for model weights
"micro_frame_size": 17,
"micro_batch_size": 4, # Batch size for processing
},
@@ -89,7 +89,7 @@ def __init__(self, device="gpu"):
hf_hub_download(
repo_id="hpcai-tech/OpenSora-VAE-v1.2",
filename="model.safetensors",
-local_dir=self.config.vae.cache_dir,
+local_dir=os.path.join(MODEL_PATH, "OpenSora-VAE-v1.2"),
)

hf_hub_download(
1 change: 1 addition & 0 deletions src/videogen_hub/pipelines/lavie/__init__.py
@@ -0,0 +1 @@

1 change: 1 addition & 0 deletions src/videogen_hub/pipelines/lavie/lavie_src/__init__.py
@@ -0,0 +1 @@

@@ -203,6 +203,8 @@ def decode(self, z, num_frames=None):
x_z_list = []
for i in range(0, z.size(2), self.micro_z_frame_size):
z_bs = z[:, :, i : i + self.micro_z_frame_size]
print("self.micro_frame_size", self.micro_frame_size)
print("num_frames", num_frames)
x_z_bs = self.temporal_vae.decode(z_bs, num_frames=min(self.micro_frame_size, num_frames))
x_z_list.append(x_z_bs)
num_frames -= self.micro_frame_size
@@ -158,6 +158,10 @@ def load_model_with_fallback(model, ckpt_path):
if colossal_imported:
ckpt_io = GeneralCheckpointIO()
ckpt_io.load_model(model, ckpt_path)
+elif os.path.exists(os.path.join(ckpt_path, 'model' + ".safetensors")):
+import safetensors.torch
+state_dict = safetensors.torch.load_file(os.path.join(ckpt_path, 'model' + ".safetensors"))
+model.load_state_dict(state_dict)
else:
model.load_state_dict(torch_load(os.path.join(ckpt_path, 'model')))

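
The new branch relies on safetensors.torch.load_file, which returns a plain state dict and needs no ColossalAI import. A self-contained sketch of the same fallback idea (the function name and paths are hypothetical, and plain torch.load stands in for the repo's torch_load helper):

import os
import torch
import safetensors.torch

def load_state_dict_with_fallback(model, ckpt_path):
    # Prefer a safetensors checkpoint when present, otherwise fall back to a torch pickle.
    st_path = os.path.join(ckpt_path, "model.safetensors")
    if os.path.exists(st_path):
        state_dict = safetensors.torch.load_file(st_path)
    else:
        state_dict = torch.load(os.path.join(ckpt_path, "model"), map_location="cpu")
    model.load_state_dict(state_dict)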
1 change: 1 addition & 0 deletions src/videogen_hub/pipelines/opensora/scripts/__init__.py
@@ -0,0 +1 @@

2 changes: 1 addition & 1 deletion src/videogen_hub/pipelines/opensora/scripts/inference.py
@@ -183,7 +183,7 @@ def main(config=None):
device=device,
additional_args=model_args,
)
-samples = vae.decode(samples.to(dtype))
+samples = vae.decode(samples.to(dtype), model_args["num_frames"])

# 4.4. save samples
if not use_dist or coordinator.is_master():
5 changes: 3 additions & 2 deletions src/videogen_hub/pipelines/videocrafter/inference.py
@@ -14,7 +14,6 @@
from .funcs import batch_ddim_sampling
from .utils import instantiate_from_config


def get_parser():
parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=20230211, help="seed for seed_everything")
@@ -55,8 +54,10 @@ def __init__(self, arg_list, device, rank: int = 0, gpu_num: int = 1):
self.args = parser.parse_args(args=arg_list)

self.gpu_no, self.gpu_num = rank, gpu_num
_dict = {'model': {'target': 'lvdm.models.ddpm3d.LatentDiffusion', 'params': {'linear_start': 0.00085, 'linear_end': 0.012, 'num_timesteps_cond': 1, 'timesteps': 1000, 'first_stage_key': 'video', 'cond_stage_key': 'caption', 'cond_stage_trainable': False, 'conditioning_key': 'crossattn', 'image_size': [40, 64], 'channels': 4, 'scale_by_std': False, 'scale_factor': 0.18215, 'use_ema': False, 'uncond_type': 'empty_seq', 'use_scale': True, 'scale_b': 0.7, 'unet_config': {'target': 'lvdm.modules.networks.openaimodel3d.UNetModel', 'params': {'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2, 1], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4, 4], 'num_head_channels': 64, 'transformer_depth': 1, 'context_dim': 1024, 'use_linear': True, 'use_checkpoint': True, 'temporal_conv': True, 'temporal_attention': True, 'temporal_selfatt_only': True, 'use_relative_position': False, 'use_causal_attention': False, 'temporal_length': 16, 'addition_attention': True, 'fps_cond': True}}, 'first_stage_config': {'target': 'lvdm.models.autoencoder.AutoencoderKL', 'params': {'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'double_z': True, 'z_channels': 4, 'resolution': 512, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}}, 'cond_stage_config': {'target': 'lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder', 'params': {'freeze': True, 'layer': 'penultimate'}}}}}

-config = OmegaConf.load(self.args.config)
+config = OmegaConf.create(_dict)
+#config = OmegaConf.load(self.args.config)

# data_config = config.pop("data", OmegaConf.create())
model_config = config.pop("model", OmegaConf.create())
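
The change above swaps OmegaConf.load, which reads a YAML file from disk, for OmegaConf.create, which builds the config from the in-memory _dict, so inference no longer depends on the VideoCrafter YAML being present. A minimal illustration using abbreviated keys taken from the dict above:

from omegaconf import OmegaConf

config = OmegaConf.create({"model": {"target": "lvdm.models.ddpm3d.LatentDiffusion", "params": {"timesteps": 1000}}})
model_config = config.pop("model", OmegaConf.create())
print(model_config.target)  # lvdm.models.ddpm3d.LatentDiffusion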
2 changes: 1 addition & 1 deletion src/videogen_hub/pipelines/videocrafter/utils.py
@@ -3,7 +3,7 @@
import cv2
import torch
import torch.distributed as dist

+import os

def count_params(model, verbose=False):
total_params = sum(p.numel() for p in model.parameters())
17 changes: 17 additions & 0 deletions src/videogen_hub/utils/__init__.py
@@ -0,0 +1,17 @@
import torch
from torchvision import transforms
from PIL import Image

def images_to_tensor(image_list):
    """
    Convert a list of PIL images to a PyTorch tensor of shape (T, C, H, W).
    """
    transform = transforms.ToTensor()

    # Convert each PIL image to a tensor and store in a list
    tensor_list = [transform(img) for img in image_list]

    # Stack the list of tensors along a new dimension to create the final tensor
    tensor = torch.stack(tensor_list, dim=0)

    return tensor
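
A quick usage sketch of the new helper (the frame count and size are illustrative):

from PIL import Image
from videogen_hub.utils import images_to_tensor

frames = [Image.new("RGB", (64, 64)) for _ in range(3)]  # dummy stand-ins for decoded video frames
video = images_to_tensor(frames)
print(video.shape)  # torch.Size([3, 3, 64, 64]), i.e. (T, C, H, W) with values in [0, 1]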
