[CI]fix requirements&codestyle (#9739)
* [CI]fix

* Compatible with paddle.where

* fix

* fix
Liujie0926 authored Jan 6, 2025
1 parent b75ff88 commit 30267d7
Showing 14 changed files with 19 additions and 16 deletions.
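
The modeling changes below all follow from the "Compatible with paddle.where" note above: the condition passed to paddle.where is now cast to a boolean tensor before use, since recent Paddle releases reject integer or float masks in that position. A minimal standalone sketch of the pattern, assuming a recent Paddle install (the masked_fill helper mirrors the ones touched in this commit; the sample inputs are made up for illustration):

import paddle

def masked_fill(x, mask, value):
    # Recent Paddle releases expect a boolean condition tensor in paddle.where,
    # so cast the mask explicitly before using it.
    mask = mask.astype("bool")
    y = paddle.full(x.shape, value, x.dtype)
    return paddle.where(mask, y, x)

x = paddle.randn([2, 3])
mask = paddle.to_tensor([[1, 0, 1], [0, 1, 0]])  # integer mask, not yet bool
print(masked_fill(x, mask, 0.0))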
2 changes: 1 addition & 1 deletion llm/experimental/ernie-3.5-se/modeling.py
@@ -135,7 +135,7 @@ class BFloatFInfo:

def masked_fill(x, mask, value):
y = paddle.full(x.shape, value, x.dtype)
- return paddle.where(mask, y, x)
+ return paddle.where(mask.to("bool"), y, x)


def scaled_dot_product_attention(
3 changes: 2 additions & 1 deletion paddlenlp/data/data_collator.py
@@ -598,7 +598,7 @@ def paddle_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = N

def masked_fill(x, mask, value):
y = paddle.full(x.shape, value, x.dtype)
- return paddle.where(mask, y, x)
+ return paddle.where(mask.to("bool"), y, x)

# probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
probability_matrix = masked_fill(probability_matrix, special_tokens_mask, value=0.0)
@@ -816,6 +816,7 @@ def paddle_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
]

def masked_fill(x, mask, value):
+ mask = mask.astype("bool")
y = paddle.full(x.shape, value, x.dtype)
return paddle.where(mask, y, x)

2 changes: 1 addition & 1 deletion paddlenlp/experimental/autonlp/requirements.txt
@@ -1,4 +1,4 @@
protobuf==3.20.2
- pydantic==1.10.11
+ pydantic
ray[tune]==2.5.1
hyperopt>=0.2.5
2 changes: 1 addition & 1 deletion paddlenlp/transformers/bloom/modeling.py
@@ -855,7 +855,7 @@ def _prepare_attn_mask(
# Attention score will be cast to float32 in the following calculation, therefore we set attention_mask dtype as float32
zero = paddle.zeros(expanded_attn_mask.shape, dtype=paddle.float32)
neg_inf = paddle.full(expanded_attn_mask.shape, paddle.finfo(paddle.float32).min, dtype=paddle.float32)
- expanded_attn_mask = paddle.where(expanded_attn_mask, zero, neg_inf)
+ expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), zero, neg_inf)
batch_size, num_heads, sq_len, kv_len = expanded_attn_mask.shape
return expanded_attn_mask.reshape([batch_size * num_heads, sq_len, kv_len])

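
For context, a hedged sketch (with a made-up 1x3 mask) of what this mask preparation produces: positions that may be attended to contribute 0.0 to the attention scores, masked positions contribute a value near -inf so the softmax assigns them roughly zero weight.

import paddle

expanded_attn_mask = paddle.to_tensor([[True, True, False]])
zero = paddle.zeros(expanded_attn_mask.shape, dtype=paddle.float32)
neg_inf = paddle.full(expanded_attn_mask.shape, paddle.finfo(paddle.float32).min, dtype=paddle.float32)
# Allowed positions add 0.0 to the scores; masked positions add a value close
# to -inf, which the later softmax maps to ~0 probability.
additive_mask = paddle.where(expanded_attn_mask.to("bool"), zero, neg_inf)
print(additive_mask)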
2 changes: 1 addition & 1 deletion paddlenlp/transformers/codegen/modeling.py
@@ -135,7 +135,7 @@ def _attn(self, query, key, value, attention_mask=None):
attn_weights = attn_weights / self.scale_attn
mask_value = paddle.to_tensor(-1e4, dtype=attn_weights.dtype)
# Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
- attn_weights = paddle.where(causal_mask, attn_weights, mask_value)
+ attn_weights = paddle.where(causal_mask.to("bool"), attn_weights, mask_value)

if attention_mask is not None:
# Apply the attention mask
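
A small illustrative sketch of the masking step above, using hypothetical shapes (one batch, one head, sequence length 4); the tril-based construction of causal_mask here is an assumption for the example, not the module's actual code.

import paddle

seq_len = 4
attn_weights = paddle.randn([1, 1, seq_len, seq_len])
# Lower-triangular matrix: position i may attend to positions <= i.
causal_mask = paddle.tril(paddle.ones([seq_len, seq_len])).reshape([1, 1, seq_len, seq_len])
# The fill value is a tensor in the weights' dtype to avoid float/double
# mismatches, and the mask is cast to bool so paddle.where accepts it.
mask_value = paddle.to_tensor(-1e4, dtype=attn_weights.dtype)
attn_weights = paddle.where(causal_mask.to("bool"), attn_weights, mask_value)
print(attn_weights[0, 0])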
2 changes: 1 addition & 1 deletion paddlenlp/transformers/gemma/modeling.py
@@ -1135,7 +1135,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values
else:
expanded_attn_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length)
# Convert bool attention_mask to float attention mask, which will be added to attention_scores later
- expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype)
+ expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
return expanded_attn_mask

@paddle.jit.not_to_static
2 changes: 1 addition & 1 deletion paddlenlp/transformers/gptj/modeling.py
@@ -152,7 +152,7 @@ def _attn(
# Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
# Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
mask_value = paddle.to_tensor(mask_value, dtype=attn_weights.dtype, place=attn_weights.place)
- attn_weights = paddle.where(causal_mask, attn_weights, mask_value)
+ attn_weights = paddle.where(causal_mask.to("bool"), attn_weights, mask_value)

attn_weights = attn_weights / self.scale_attn

5 changes: 3 additions & 2 deletions paddlenlp/transformers/mixtral/modeling.py
@@ -299,7 +299,7 @@ def scaled_dot_product_attention(

def masked_fill(x, mask, value):
y = paddle.full(x.shape, value, x.dtype)
- return paddle.where(mask, y, x)
+ return paddle.where(mask.to("bool"), y, x)


def is_casual_mask(attention_mask):
@@ -519,6 +519,7 @@ def forward(self, hidden_states):
# this will be used to easily index which expert is going to be sollicitated.
# shape: [num_experts, top_k, batch_size * seq_len]
expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts).transpose([2, 1, 0])
+ expert_mask = expert_mask.to("bool")

# Loop over all available experts in the model and perform the computation on each expert.
for expert_id in range(self.num_experts):
@@ -1098,7 +1099,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values
past_key_values_length=past_key_values_length,
)
# Convert bool attention_mask to float attention mask, which will be added to attention_scores later
- expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype)
+ expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
return expanded_attn_mask

@paddle.jit.not_to_static
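
For the MoE routing change in this file, a hedged standalone sketch with made-up sizes (4 experts, top-2 routing, 3 tokens) showing the shape of the one-hot routing mask and why it is cast to bool right after construction:

import paddle
import paddle.nn.functional as F

num_experts = 4
# selected_experts: [num_tokens, top_k] expert indices chosen per token.
selected_experts = paddle.to_tensor([[0, 2], [1, 3], [0, 1]])
# one_hot -> [num_tokens, top_k, num_experts]; transpose -> [num_experts, top_k, num_tokens].
expert_mask = F.one_hot(selected_experts, num_classes=num_experts).transpose([2, 1, 0])
# Cast once up front so every later lookup sees a boolean mask, in line with
# the paddle.where compatibility fixes elsewhere in this commit.
expert_mask = expert_mask.to("bool")
print(expert_mask.shape)  # [4, 2, 3]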
4 changes: 2 additions & 2 deletions paddlenlp/transformers/qwen2/modeling.py
@@ -237,7 +237,7 @@ def scaled_dot_product_attention(

def masked_fill(x, mask, value):
y = paddle.full(x.shape, value, x.dtype)
- return paddle.where(mask, y, x)
+ return paddle.where(mask.to("bool"), y, x)


def is_casual_mask(attention_mask):
@@ -979,7 +979,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values
past_key_values_length=past_key_values_length,
)
# Convert bool attention_mask to float attention mask, which will be added to attention_scores later
- expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype)
+ expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
return expanded_attn_mask

@paddle.jit.not_to_static
4 changes: 2 additions & 2 deletions paddlenlp/transformers/qwen2_moe/modeling.py
@@ -298,7 +298,7 @@ def scaled_dot_product_attention(

def masked_fill(x, mask, value):
y = paddle.full(x.shape, value, x.dtype)
- return paddle.where(mask, y, x)
+ return paddle.where(mask.to("bool"), y, x)


def is_casual_mask(attention_mask):
@@ -1121,7 +1121,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values
past_key_values_length=past_key_values_length,
)
# Convert bool attention_mask to float attention mask, which will be added to attention_scores later
- expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype)
+ expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
return expanded_attn_mask

@paddle.jit.not_to_static
1 change: 1 addition & 0 deletions paddlenlp/trl/embedding_trainer.py
@@ -15,6 +15,7 @@
from contextlib import nullcontext

import paddle
+
try:
from paddle.base import core
except:
2 changes: 1 addition & 1 deletion requirements-dev.txt
@@ -29,4 +29,4 @@ data
wget
huggingface_hub>=0.19.2
tiktoken
- tokenizers
+ tokenizers>=0.21,<0.22
2 changes: 1 addition & 1 deletion requirements.txt
@@ -26,4 +26,4 @@ jinja2
regex
numpy<=1.26.4
tiktoken
- tokenizers<0.21
+ tokenizers>=0.21,<0.22
2 changes: 1 addition & 1 deletion tests/requirements.txt
@@ -5,7 +5,7 @@ torch>=1.5
transformers
fast_dataindex
sacremoses
- pydantic==1.10.9
+ pydantic
modelscope
hyperopt
h5py
