Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix typos #942

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions paddlemix/datacopilot/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ from paddlemix.datacopilot.core import MMDataset, SCHEMA
import paddlemix.datacopilot.ops as ops
```

## 四、核心概念
## 四、核心概念
工具的核心概念包括Schema和Dataset。Schema用于定义多模态数据的组织结构和字段名称。MMDataset是数据操作的核心类,是存储、查看、转换、生成等操作的基本对象。

### SCHEMA
Expand All @@ -40,7 +40,7 @@ class SCHEMA(Enum):

### DATASET

核心类MMDeteset,为存储,查看,转换,生成等操作的基本对象。支持基本的运算(切片,加法,遍历等操作)。支持json数据源。内置map,filter函数,用于高效处理数据,支持多进程和多线程并发功能。支持链式调用,方便组合多种原子操作以实现复杂的功能。通过以map函数为接口实现对数据集每个元素的处理,通过register注册机制可灵活新增作用于整个数据集的通用操作功能。
核心类MMDataset,为存储,查看,转换,生成等操作的基本对象。支持基本的运算(切片,加法,遍历等操作)。支持json数据源。内置map,filter函数,用于高效处理数据,支持多进程和多线程并发功能。支持链式调用,方便组合多种原子操作以实现复杂的功能。通过以map函数为接口实现对数据集每个元素的处理,通过register注册机制可灵活新增作用于整个数据集的通用操作功能。
```
'from_auto',
'from_json',
Expand Down Expand Up @@ -184,7 +184,7 @@ sampled_dataset = dataset.sample(10) # 随机抽取10个样本
```

## 七、使用案例
1. 导入导出
1. 导入导出
```
import functools
from paddlemix.datacopilot.core import MMDataset, SCHEMA
Expand All @@ -196,9 +196,9 @@ print(len(dataset))
dataset.export_json('/path/to/your/output/file.json')
```

2. 字段处理
2. 字段处理
```
# custom code
# custom code
def update_url(item: T) -> T: ...

def augment_prompt(item: T) -> T: ...
Expand All @@ -216,9 +216,9 @@ dataset = dataset.map(update_url).map(augment_prompt)
dataset = dataset.filter(is_wanted).nonempty()
```

3. LLaVA-SFT训练
3. LLaVA-SFT训练
数据准备和训练流程请参考项目 [pp_cap_instruct](https://aistudio.baidu.com/projectdetail/7917712)。

## 八、总结
**DataCopilot** 是 **PaddleMIX** 提供的一个强大且灵活的多模态数据处理工具箱。
通过掌握其基本操作和高级功能,你可以高效地处理、增强和转换多模态数据,为后续的模型训练和推理提供有力支持。
通过掌握其基本操作和高级功能,你可以高效地处理、增强和转换多模态数据,为后续的模型训练和推理提供有力支持。
30 changes: 15 additions & 15 deletions paddlemix/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,14 +240,14 @@ class MapDataset(Dataset):

def __init__(self, data, **kwargs):
self.data = data
self._transform_pipline = []
self._transform_pipeline = []
self.new_data = self.data
self.info = kwargs
self.label_list = self.info.pop("label_list", None)
self.vocab_info = self.info.pop("vocab_info", None)

def _transform(self, data):
for fn in self._transform_pipline:
for fn in self._transform_pipeline:
data = fn(data)
return data

Expand All @@ -256,7 +256,7 @@ def __getitem__(self, idx):
Basic function of `MapDataset` to get sample from dataset with a given
index.
"""
return self._transform(self.new_data[idx]) if self._transform_pipline else self.new_data[idx]
return self._transform(self.new_data[idx]) if self._transform_pipeline else self.new_data[idx]

def __len__(self):
"""
Expand Down Expand Up @@ -380,7 +380,7 @@ def _map(self, fn, lazy=True, batched=False):
if batched:
self.new_data = fn(self.new_data)
elif lazy:
self._transform_pipline.append(fn)
self._transform_pipeline.append(fn)
else:
self.new_data = [fn(self.new_data[idx]) for idx in range(len(self.new_data))]
return self
Expand All @@ -403,22 +403,22 @@ class IterDataset(IterableDataset):

def __init__(self, data, **kwargs):
self.data = data
self._transform_pipline = []
self._filter_pipline = []
self._transform_pipeline = []
self._filter_pipeline = []

self.label_list = kwargs.pop("label_list", None)
self.vocab_info = kwargs.pop("vocab_info", None)

def _transform(self, data):
for fn in self._transform_pipline:
for fn in self._transform_pipeline:
data = fn(data)
return data

def _shard_filter(self, num_samples):
return True

def _filter(self, data):
for fn in self._filter_pipline:
for fn in self._filter_pipeline:
if not fn(data):
return False
return True
Expand All @@ -430,19 +430,19 @@ def __iter__(self):
num_samples = 0
if inspect.isfunction(self.data):
for example in self.data():
if (not self._filter_pipline or self._filter(self._filter_pipline)) and self._shard_filter(
if (not self._filter_pipeline or self._filter(self._filter_pipeline)) and self._shard_filter(
num_samples=num_samples
):
yield self._transform(example) if self._transform_pipline else example
yield self._transform(example) if self._transform_pipeline else example
num_samples += 1
else:
if inspect.isgenerator(self.data):
warnings.warn("Reciving generator as data source, data can only be iterated once")
warnings.warn("Receiving generator as data source, data can only be iterated once")
for example in self.data:
if (not self._filter_pipline or self._filter(self._filter_pipline)) and self._shard_filter(
if (not self._filter_pipeline or self._filter(self._filter_pipeline)) and self._shard_filter(
num_samples=num_samples
):
yield self._transform(example) if self._transform_pipline else example
yield self._transform(example) if self._transform_pipeline else example
num_samples += 1

def filter(self, fn):
Expand All @@ -455,7 +455,7 @@ def filter(self, fn):
returns a boolean. Samples that return False are discarded.
"""

self._filter_pipline.append(fn)
self._filter_pipeline.append(fn)

return self

Expand Down Expand Up @@ -495,7 +495,7 @@ def map(self, fn):
sample as argument.
"""

self._transform_pipline.append(fn)
self._transform_pipeline.append(fn)

return self

Expand Down
4 changes: 2 additions & 2 deletions paddlemix/examples/qwen2_vl/Qwen2_VL_run.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -316,10 +316,10 @@
"outputs": [],
"source": [
"# 2B\n",
"sh paddlemix/examples/qwen2_vl/shell/basline_2b_bs32_1e8.sh\n",
"sh paddlemix/examples/qwen2_vl/shell/baseline_2b_bs32_1e8.sh\n",
"\n",
"# 7B\n",
"sh paddlemix/examples/qwen2_vl/shell/basline_7b_bs32_1e8.sh"
"sh paddlemix/examples/qwen2_vl/shell/baseline_7b_bs32_1e8.sh"
]
},
{
Expand Down
8 changes: 4 additions & 4 deletions paddlemix/examples/qwen2_vl/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,16 +85,16 @@ wget https://paddlenlp.bj.bcebos.com/datasets/paddlemix/playground/opensource_js

```bash
# 2B
sh paddlemix/examples/qwen2_vl/shell/basline_2b_bs32_1e8.sh
sh paddlemix/examples/qwen2_vl/shell/baseline_2b_bs32_1e8.sh

# 2B lora
sh paddlemix/examples/qwen2_vl/shell/basline_2b_lora_bs32_1e8.sh
sh paddlemix/examples/qwen2_vl/shell/baseline_2b_lora_bs32_1e8.sh

# 7B
sh paddlemix/examples/qwen2_vl/shell/basline_7b_bs32_1e8.sh
sh paddlemix/examples/qwen2_vl/shell/baseline_7b_bs32_1e8.sh

# 7B lora
sh paddlemix/examples/qwen2_vl/shell/basline_7b_lora_bs32_1e8.sh
sh paddlemix/examples/qwen2_vl/shell/baseline_7b_lora_bs32_1e8.sh
```

注意:微调2b模型的运行示例如下:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export MASTER_PORT=34229
export TF_CPP_MIN_LOG_LEVEL=3

OUTPUT_DIR='work_dirs/basline_330k_2b_bs32_1e8'
OUTPUT_DIR='work_dirs/baseline_330k_2b_bs32_1e8'

if [ ! -d "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export MASTER_PORT=34229
export TF_CPP_MIN_LOG_LEVEL=3

OUTPUT_DIR='work_dirs/basline_330k_2b_bs32_1e8'
OUTPUT_DIR='work_dirs/baseline_330k_2b_bs32_1e8'

if [ ! -d "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export MASTER_PORT=34229
export TF_CPP_MIN_LOG_LEVEL=3

OUTPUT_DIR='work_dirs/basline_330k_7b_bs32_1e8'
OUTPUT_DIR='work_dirs/baseline_330k_7b_bs32_1e8'

if [ ! -d "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export MASTER_PORT=34229
export TF_CPP_MIN_LOG_LEVEL=3

OUTPUT_DIR='work_dirs/basline_330k_7b_bs32_1e8'
OUTPUT_DIR='work_dirs/baseline_330k_7b_bs32_1e8'

if [ ! -d "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
Expand Down