Skip to content

Commit

Permalink
Add CLI options, Fix multiprocessing/threading
Browse files Browse the repository at this point in the history
  • Loading branch information
ciscorn committed Feb 17, 2023
1 parent 1f0ebdf commit d8c2042
Show file tree
Hide file tree
Showing 8 changed files with 297 additions and 82 deletions.
13 changes: 9 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,24 @@ pip3 install mojxml
## コマンドラインインタフェース

```
Usage: mojxml2ogr [OPTIONS] DST_FILE SRC_FILES...
Usage: python -m mojxml [OPTIONS] DST_FILE SRC_FILES...
Convert MoJ XMLs to GeoJSON/GeoPackage/FlatGeobuf/etc.
DST_FILE: output filename (.geojson, .gpkg, .fgb, etc.)
SRC_FILES: one or more .xml/.zip files
Options:
--worker [multiprocess|thread|single]
[default: multiprocess]
-a, --arbitrary Include 任意座標系
-c, --chikugai Include 地区外 and 別図
```

### 使用例
出力形式は拡張子で判断されます。

- 出力形式は拡張子で判断されます。
- 任意座標系のXMLファイルは無視します(今後オプションを追加)。
### 使用例

```bash
# XMLファイルをGeoJSONに変換
Expand Down
37 changes: 34 additions & 3 deletions mojxml/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@

import click

from .process import process_file
from .process import process_file, ProcessOptions
from .process.executor import EXECUTOR_MAP


@click.command()
Expand All @@ -16,7 +17,31 @@
required=True,
type=click.Path(exists=True, dir_okay=False, path_type=Path),
)
def main(dst_file: Path, src_files: list[Path]) -> None:
@click.option(
"--worker",
type=click.Choice(list(EXECUTOR_MAP.keys())),
default="multiprocess",
show_default=True,
)
@click.option(
"-a",
"--arbitrary",
is_flag=True,
show_default=True,
default=False,
help="Include 任意座標系",
)
@click.option(
"-c",
"--chikugai",
is_flag=True,
show_default=True,
default=False,
help="Include 地区外 and 別図",
)
def main(
dst_file: Path, src_files: list[Path], worker: str, arbitrary: bool, chikugai: bool
) -> None:
"""Convert MoJ XMLs to GeoJSON/GeoPackage/FlatGeobuf/etc.
DST_FILE: output filename (.geojson, .gpkg, .fgb, etc.)
Expand All @@ -30,8 +55,14 @@ def main(dst_file: Path, src_files: list[Path]) -> None:
root_logger.addHandler(handler)
root_logger.setLevel(logging.INFO)

options = ProcessOptions(
executor=EXECUTOR_MAP[worker](),
include_arbitrary_crs=arbitrary,
include_chikugai=chikugai,
)

# Process files
process_file(src_paths=src_files, dst_path=dst_file)
process_file(src_paths=src_files, dst_path=dst_file, options=options)


if __name__ == "__main__":
Expand Down
46 changes: 29 additions & 17 deletions mojxml/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import lxml.etree as et
import pyproj
import shapely

from .constants import CRS_MAP
from .constants import XML_NAMESPACES as _NS
Expand Down Expand Up @@ -123,7 +122,7 @@ def _parse_surfaces(


def _parse_features(
subject_elem: et._Element, surfaces: dict[str, Surface]
subject_elem: et._Element, surfaces: dict[str, Surface], include_chikugai: bool
) -> list[Feature]:
features = []
for fude in subject_elem.iterfind("./筆", _NS):
Expand All @@ -148,26 +147,29 @@ def _parse_features(
"市区町村名": None,
"座標系": None,
"測地系判別": None,
"代表点経度": None,
"代表点緯度": None,
}
geometry = None
for entry in fude:
key = entry.tag.split("}")[1]
if key == "形状":
coordinates = surfaces[entry.attrib["idref"]]
geometry = {"type": "MultiPolygon", "coordinates": coordinates}
rep_point = shapely.MultiPolygon(
(p[0], p[1:]) for p in coordinates
).point_on_surface()
properties["代表点経度"] = rep_point.x
properties["代表点緯度"] = rep_point.y
# rep_point = shapely.MultiPolygon(
# (p[0], p[1:]) for p in coordinates
# ).point_on_surface()
# properties["代表点経度"] = rep_point.x
# properties["代表点緯度"] = rep_point.y
else:
value = entry.text
properties[key] = value

# 地番が地区外や別図の場合はスキップする
chiban = properties.get("地番", "")
if "地区外" in chiban or "別図" in chiban:
continue
if not include_chikugai:
# 地番が地区外や別図の場合はスキップする
chiban = properties.get("地番", "")
if "地区外" in chiban or "別図" in chiban:
continue

features.append(
{"type": "Feature", "geometry": geometry, "properties": properties}
Expand All @@ -176,7 +178,9 @@ def _parse_features(
return features


def parse_raw(content: bytes) -> list[Feature]:
def parse_raw(
content: bytes, include_arbitrary_crs: bool = False, include_chikugai: bool = False
) -> list[Feature]:
"""TODO:"""
doc = et.fromstring(content, None)

Expand All @@ -185,11 +189,9 @@ def parse_raw(content: bytes) -> list[Feature]:
source_crs = CRS_MAP[doc.find("./座標系", _NS).text]

# 任意座標系の場合はスキップ(とりあえず)
if source_crs is None:
if (not include_arbitrary_crs) and source_crs is None:
return []

_logger.info(f"parsing {base_props['地図名']}...")

spatial_elem = doc.find("./空間属性", _NS)
points = _parse_points(spatial_elem)
curves = _parse_curves(spatial_elem, points)
Expand All @@ -199,8 +201,16 @@ def parse_raw(content: bytes) -> list[Feature]:
transformer = pyproj.Transformer.from_crs(
source_crs, "epsg:4326", always_xy=True
)
curve_ids: list[str] = []
xx: list[float] = []
yy: list[float] = []
for curve_id, (x, y) in curves.items():
curves[curve_id] = transformer.transform(y, x)
curve_ids.append(curve_id)
xx.append(x)
yy.append(y)
(xx, yy) = transformer.transform(yy, xx)
for curve_id, x, y in zip(curve_ids, xx, yy):
curves[curve_id] = (x, y)

# 小数点以下9ケタに丸める
for curve_id, (x, y) in curves.items():
Expand All @@ -225,7 +235,9 @@ def parse_raw(content: bytes) -> list[Feature]:
# fude_to_zukakus[fude_id] = zukaku

subject_elem = doc.find("./主題属性", _NS)
features = _parse_features(subject_elem, surfaces)
features = _parse_features(
subject_elem, surfaces, include_chikugai=include_chikugai
)

# XMLのルート要素にある属性情報をFeatureのプロパティに追加する
for feature in features:
Expand Down
55 changes: 0 additions & 55 deletions mojxml/process.py

This file was deleted.

96 changes: 96 additions & 0 deletions mojxml/process/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""Convert .xml/.zip files to OGR format."""

import logging
from pathlib import Path
from typing import Iterable, Optional
from dataclasses import dataclass

import fiona

from ..mojzip import MojXMLZipFile
from ..schema import OGR_SCHEMA
from .executor import (
BaseExecutor,
ProcessPoolExecutor,
)

_logger = logging.getLogger(__name__)


@dataclass
class ProcessOptions:
    """Settings bundle accepted by ``process_file``.

    ``process_file`` forwards every field, unchanged, to ``process_raw`` as
    the keyword argument of the same name.
    """

    # Driver name handed to ``fiona.open``; ``None`` is passed through as-is.
    driver: Optional[str] = None

    # Executor used to parse the inputs. ``process_raw`` substitutes a
    # ``ProcessPoolExecutor()`` when this is ``None``.
    executor: Optional[BaseExecutor] = None

    # Forwarded to ``executor.process``. Intended to keep inputs that use an
    # arbitrary coordinate system instead of dropping them.
    include_arbitrary_crs: bool = False

    # Forwarded to ``executor.process``. Intended to keep parcels marked as
    # outside the district or drawn on a separate sheet instead of skipping them.
    include_chikugai: bool = False


def process_raw(
    src_iter: Iterable[bytes],
    dst_path: Path,
    driver: Optional[str] = None,
    executor: Optional[BaseExecutor] = None,
    include_arbitrary_crs: bool = False,
    include_chikugai: bool = False,
) -> Iterable[tuple[int, int]]:  # (num_files, num_features)
    """Write features parsed from raw XML payloads to ``dst_path``.

    This is a generator. The destination is opened with ``fiona.open`` in
    write mode (schema ``OGR_SCHEMA``, CRS ``EPSG:4326``) and stays open
    until the generator is exhausted or closed.

    Args:
        src_iter: Raw XML documents, handed to ``executor.process``.
        dst_path: Output file path.
        driver: Driver name passed to ``fiona.open``.
        executor: Executor that turns ``src_iter`` into batches of features;
            a ``ProcessPoolExecutor()`` is created when ``None``.
        include_arbitrary_crs: Forwarded to ``executor.process``.
        include_chikugai: Forwarded to ``executor.process``.

    Yields:
        ``(num_files, num_features)`` after each batch is written: the number
        of batches received so far (one batch is counted as one file) and the
        running total of features written.
    """
    with fiona.open(
        dst_path, "w", driver=driver, schema=OGR_SCHEMA, crs="EPSG:4326"
    ) as sink:
        # Fall back to the default executor only when the caller supplied none.
        worker = ProcessPoolExecutor() if executor is None else executor

        batches = worker.process(
            src_iter,
            include_arbitrary_crs=include_arbitrary_crs,
            include_chikugai=include_chikugai,
        )

        feature_total = 0
        for file_count, batch in enumerate(batches, start=1):
            sink.writerecords(batch)
            feature_total += len(batch)
            yield (file_count, feature_total)


def _iter_content_xml(src_paths: list[Path]) -> Iterable[bytes]:
    """Yield the raw XML bytes found in each source path, in input order.

    A ``.xml`` file contributes its whole content as a single item. A ``.zip``
    file is opened with ``MojXMLZipFile`` and contributes every item produced
    by its ``iter_xml_contents()``. Suffixes are matched case-insensitively.

    Args:
        src_paths: Paths to ``.xml`` and/or ``.zip`` files.

    Yields:
        The content of one XML document as ``bytes``.

    Raises:
        ValueError: If a path has a suffix other than ``.xml`` or ``.zip``.
            Raised lazily, when iteration reaches that path.
    """
    for src_path in src_paths:
        src_path = Path(src_path)
        # Normalise the case so that names such as ``FOO.XML`` / ``FOO.ZIP``
        # are accepted instead of being rejected as unsupported.
        suffix = src_path.suffix.lower()
        if suffix == ".xml":
            # read_bytes() closes the file before the value is yielded, so no
            # handle stays open while the consumer processes the content.
            yield src_path.read_bytes()
        elif suffix == ".zip":
            with MojXMLZipFile(src_path) as mzf:
                yield from mzf.iter_xml_contents()
        else:
            raise ValueError(f"Unsupported file type: {src_path.suffix}")


def process_file(
    src_paths: list[Path], dst_path: Path, options: Optional[ProcessOptions] = None
) -> None:
    """Convert the given .xml/.zip files into a single output file.

    The XML payloads of ``src_paths`` are streamed through ``process_raw``,
    which writes them to ``dst_path``. Progress is logged at INFO level after
    every 10th processed file, and once more when everything is done.

    Args:
        src_paths: Source ``.xml`` / ``.zip`` files.
        dst_path: Output file path.
        options: Processing options. ``None`` means a default
            ``ProcessOptions()``, so ``process_file(src_paths, dst_path)``
            works without building an options object.
    """
    if options is None:
        options = ProcessOptions()

    # Pre-initialised so the final summary is valid even when nothing is yielded.
    num_files = 0
    num_features = 0

    for num_files, num_features in process_raw(
        _iter_content_xml(src_paths),
        dst_path,
        driver=options.driver,
        executor=options.executor,
        include_arbitrary_crs=options.include_arbitrary_crs,
        include_chikugai=options.include_chikugai,
    ):
        # process_raw counts files starting at 1, so a plain modulo check is
        # enough to report every 10th file.
        if num_files % 10 == 0:
            _logger.info(
                f"{num_files} files processed, {num_features} features written"
            )
    _logger.info(f"{num_files} files processed, {num_features} features written")
Loading

0 comments on commit d8c2042

Please sign in to comment.