🐛 Bug
Traceback (most recent call last):
File "/opt/meituan/spark-3.0/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
process()
File "/opt/meituan/spark-3.0/python/lib/pyspark.zip/pyspark/worker.py", line 595, in process
out_iter = func(split_index, iterator)
File "/opt/meituan/spark-3.0/python/lib/pyspark.zip/pyspark/rdd.py", line 2596, in pipeline_func
File "/opt/meituan/spark-3.0/python/lib/pyspark.zip/pyspark/rdd.py", line 2596, in pipeline_func
File "/opt/meituan/spark-3.0/python/lib/pyspark.zip/pyspark/rdd.py", line 425, in func
File "/tmp/spark-96ed7bdc-46bd-4538-9495-2460c409d776/MLP643873.py", line 260, in _process
File "./udf_mlp.zip/udf_mlp/mlp_inference_executor.py", line 48, in do_inference
inf_result_list = self.model.generation(poi_ord_seq_list)
File "./udf_user_code.zip/user_model.py", line 23, in generation
output = self.model.encode(prompt_list)
File "/usr/local/conda/lib/python3.9/site-packages/vllm/utils.py", line 1062, in inner
return fn(*args, **kwargs)
File "/usr/local/conda/lib/python3.9/site-packages/vllm/entrypoints/llm.py", line 781, in encode
outputs = self._run_engine(use_tqdm=use_tqdm)
File "/usr/local/conda/lib/python3.9/site-packages/vllm/entrypoints/llm.py", line 942, in _run_engine
step_outputs = self.llm_engine.step()
File "/usr/local/conda/lib/python3.9/site-packages/vllm/engine/llm_engine.py", line 1454, in step
outputs = self.model_executor.execute_model(
File "/usr/local/conda/lib/python3.9/site-packages/vllm/executor/gpu_executor.py", line 125, in execute_model
output = self.driver_worker.execute_model(execute_model_req)
File "/usr/local/conda/lib/python3.9/site-packages/vllm/worker/worker_base.py", line 343, in execute_model
output = self.model_runner.execute_model(
File "/usr/local/conda/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/conda/lib/python3.9/site-packages/vllm/worker/embedding_model_runner.py", line 101, in execute_model
hidden_or_intermediate_states = model_executable(
File "/usr/local/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/conda/lib/python3.9/site-packages/vllm/model_executor/models/qwen2.py", line 468, in forward
hidden_states = self.model(input_ids, positions, kv_caches,
File "/usr/local/conda/lib/python3.9/site-packages/vllm/compilation/decorators.py", line 143, in __call__
return self.forward(*args, **kwargs)
File "/usr/local/conda/lib/python3.9/site-packages/vllm/model_executor/models/qwen2.py", line 320, in forward
hidden_states, residual = layer(
File "/usr/local/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/conda/lib/python3.9/site-packages/vllm/model_executor/models/qwen2.py", line 228, in forward
hidden_states = self.self_attn(
File "/usr/local/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/conda/lib/python3.9/site-packages/vllm/model_executor/models/qwen2.py", line 171, in forward
attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
File "/usr/local/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/conda/lib/python3.9/site-packages/vllm/attention/layer.py", line 99, in forward
return self.impl.forward(query,
File "/usr/local/conda/lib/python3.9/site-packages/vllm/attention/backends/xformers.py", line 551, in forward
out = self._run_memory_efficient_xformers_forward(
File "/usr/local/conda/lib/python3.9/site-packages/vllm/attention/backends/xformers.py", line 723, in _run_memory_efficient_xformers_forward
out = xops.memory_efficient_attention_forward(
File "/usr/local/conda/lib/python3.9/site-packages/xformers/ops/fmha/__init__.py", line 376, in memory_efficient_attention_forward
return _memory_efficient_attention_forward(
File "/usr/local/conda/lib/python3.9/site-packages/xformers/ops/fmha/__init__.py", line 486, in _memory_efficient_attention_forward
op = _dispatch_fw(inp, False)
File "/usr/local/conda/lib/python3.9/site-packages/xformers/ops/fmha/dispatch.py", line 135, in _dispatch_fw
return _run_priority_list(
File "/usr/local/conda/lib/python3.9/site-packages/xformers/ops/fmha/dispatch.py", line 76, in _run_priority_list
raise NotImplementedError(msg)
NotImplementedError: No operator found for `memory_efficient_attention_forward` with inputs:
query : shape=(1, 20394, 2, 6, 128) (torch.float32)
key : shape=(1, 20394, 2, 6, 128) (torch.float32)
value : shape=(1, 20394, 2, 6, 128) (torch.float32)
attn_bias : <class 'xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask'>
p : 0.0
`[email protected]` is not supported because:
xFormers wasn't build with CUDA support
dtype=torch.float32 (supported: {torch.float16, torch.bfloat16})
`cutlassF-pt` is not supported because:
xFormers wasn't build with CUDA support
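The tail of the traceback is the actual failure: the xformers dispatcher finds no fused attention operator because the inputs are float32 (the fused CUDA kernels only accept float16/bfloat16), and it additionally reports that the installed wheel wasn't built with CUDA support. A small standalone probe along these lines (hypothetical shapes, not taken from the job above) can separate the two causes even without access to `python -m xformers.info`:

```python
# Hypothetical standalone probe of the same xformers dispatch path that fails above.
# If only the float32 call raises NotImplementedError, the dtype is the problem;
# if the float16 call fails too, the installed xformers likely has no CUDA kernels.
import torch
import xformers
import xformers.ops as xops

print("xformers version:", xformers.__version__)
print("CUDA available:", torch.cuda.is_available())

def try_attention(dtype: torch.dtype) -> None:
    # (batch, seq_len, num_heads, head_dim) layout for memory_efficient_attention_forward
    q = torch.randn(1, 128, 8, 64, dtype=dtype, device="cuda")
    try:
        xops.memory_efficient_attention_forward(q, q, q)
        print(f"{dtype}: operator found")
    except NotImplementedError as exc:
        print(f"{dtype}: no operator found -> {exc}")

try_attention(torch.float32)  # expected to fail, matching the traceback
try_attention(torch.float16)  # expected to succeed if CUDA kernels are present
```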
Command
I want to use vllm==0.6.4 to accelerate the qwen2-1.5B-instruct model as an embedding model, but I hit this error. My code is as follows:
```python
def init(self, params) -> None:
    print("beginning init model ........")
    # self.model = SentenceTransformer(params['model_name'])
    self.model = LLM(model=params['model_name'], task="embedding", dtype="float32",
                     max_model_len=int(params['max_seq_length']), trust_remote_code=True)
    self.params = params

def generation(self, batch_data):
    prompt_list = []
    for _ in batch_data:
        input_str = self.get_prompt(_)
        prompt_list.append(input_str)
    output = self.model.encode(prompt_list)
    embeddings = output.outputs.embedding
    return [_[:self.params['emb_size']].tolist() for _ in embeddings]
```
However, I still get this error, and for various reasons I can't run `python -m xformers.info`: the code runs on my company's internal platform and I have no access to a terminal or similar tools...
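Based on the error message, my understanding (an unverified sketch, not something I could test on the platform) is that the fused xformers attention only accepts float16/bfloat16, so loading the model in one of those dtypes may avoid the dispatch failure. As far as I can tell, `LLM.encode()` also returns one `EmbeddingRequestOutput` per prompt, so the result needs to be indexed rather than accessed directly. The same two methods as above, adjusted accordingly (`params` keys and `get_prompt` are from my code and defined elsewhere):

```python
# Sketch of a possible workaround; not verified on the platform above.
from vllm import LLM

def init(self, params) -> None:
    # "float16" (or "bfloat16") instead of "float32": the xformers kernels in the
    # traceback only support fp16/bf16.
    self.model = LLM(model=params['model_name'], task="embedding", dtype="float16",
                     max_model_len=int(params['max_seq_length']), trust_remote_code=True)
    self.params = params

def generation(self, batch_data):
    prompt_list = [self.get_prompt(item) for item in batch_data]
    # encode() returns a list with one EmbeddingRequestOutput per prompt
    outputs = self.model.encode(prompt_list)
    # each .outputs.embedding appears to be a plain Python list of floats in vLLM 0.6.x
    embeddings = [o.outputs.embedding for o in outputs]
    return [emb[:self.params['emb_size']] for emb in embeddings]
```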
Environment
torch==2.5.1+cu124
xformers==0.0.28.post3
python==3.10
vllm==0.6.4
GPU: A100-80G