Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Tyr Worker] Process poi and push to asgard s3 #4212

Merged
merged 1 commit into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker/debian8/Dockerfile-master
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ RUN apt-get remove --yes cmake \

# Python's 'requests' package handles its own CA certificate list
# Let's force it to use the OS's list
ENV REQUESTS_CA_BUNDLE /etc/ssl/certs
ENV REQUESTS_CA_BUNDLE /etc/ssl/certs/ca-certificates.crt

# install rustup
ENV RUSTUP_HOME=/usr/local/rustup \
Expand Down
43 changes: 43 additions & 0 deletions source/tyr/tyr/binarisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import datetime
import shutil
from functools import wraps
import subprocess

from flask import current_app
from shapely.geometry import MultiPolygon
Expand All @@ -58,6 +59,8 @@

from tyr.minio import MinioWrapper

from tyr.poi_to_excluded_zones import poi_to_excluded_zones


def unzip_if_needed(filename):
if not os.path.isdir(filename):
Expand Down Expand Up @@ -1233,6 +1236,46 @@ def gtfs2s3(self, instance_config, filename, job_id, dataset_uid):
_inner_2s3(self, "gtfs", instance_config, filename, job_id, dataset_uid)


@celery.task(bind=True)
def poi2asgard(self, instance_config, filename, job_id, dataset_uid):
    """Extract the excluded zones of a POI archive and synchronize them with the Asgard S3 bucket.

    The zones are written as JSON files into a local ``excluded_zones``
    directory (recreated from scratch on every run) by
    ``poi_to_excluded_zones``; the directory is then pushed with
    ``aws s3 sync`` to ``s3://<MINIO_ASGARD_BUCKET_NAME>/excluded_zones``.

    :param instance_config: not used by this task
    :param filename: path of the POI zip archive to process
    :param job_id: id of the ``models.Job`` this task belongs to
    :param dataset_uid: dataset uid forwarded to ``collect_metric``
    :raises Exception: when the Asgard bucket is not configured or when the
        ``aws s3 sync`` command exits with a non-zero status. Any exception
        marks both the job and the dataset as "failed" before being re-raised.
    """
    job = models.Job.query.get(job_id)
    dataset = _retrieve_dataset_and_set_state("poi", job.id)
    instance = job.instance
    logger = get_instance_logger(instance, task_id=job_id)

    excluded_zone_dir = "excluded_zones"

    try:
        # The extraction is done inside the try block so that a broken archive
        # also flags the job/dataset as failed and gets committed.
        if os.path.isdir(excluded_zone_dir):
            shutil.rmtree(excluded_zone_dir)
        os.mkdir(excluded_zone_dir)
        poi_to_excluded_zones(filename, excluded_zone_dir, instance.name)

        with collect_metric("poi2Asgard", job, dataset_uid):
            asgard_bucket = current_app.config.get('MINIO_ASGARD_BUCKET_NAME', None)
            if not asgard_bucket:
                raise Exception("Asgard Bucket is None")

            # NOTE: REQUESTS_CA_BUNDLE is forced on the command itself. According
            # to the author's review remark, the sync does not work without it
            # even though the Docker image already sets the variable (reason
            # unknown).
            # The argv list is built directly instead of splitting a string, so
            # a value containing whitespace cannot be broken into several
            # arguments.
            command = [
                "env",
                "REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt",
                "aws",
                "s3",
                "sync",
                "./{}".format(excluded_zone_dir),
                "s3://{}/excluded_zones".format(asgard_bucket),
            ]
            # stderr has to be piped as well, otherwise communicate() always
            # returns None for it and failures go unnoticed.
            process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            output, error = process.communicate()
            # The exit status is the reliable failure signal; stderr alone may
            # also carry mere warnings.
            if process.returncode != 0:
                details = error.decode("utf-8", "replace").strip() if error else ""
                if not details:
                    details = "exit code {}".format(process.returncode)
                raise Exception("Error occurred when putting excluded zones to asgard: {}".format(details))
    except:  # noqa: E722 - always re-raised once the states are updated
        logger.exception("")
        job.state = "failed"
        dataset.state = "failed"
        raise
    finally:
        models.db.session.commit()


def _inner_2s3(self, dataset_type, instance_config, filename, job_id, dataset_uid):
job = models.Job.query.get(job_id)
dataset = _retrieve_dataset_and_set_state(dataset_type, job.id)
Expand Down
2 changes: 2 additions & 0 deletions source/tyr/tyr/default_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@

MINIO_BUCKET_NAME = os.getenv('TYR_MINIO_BUCKET_NAME', None)

MINIO_ASGARD_BUCKET_NAME = os.getenv('TYR_MINIO_ASGARD_BUCKET_NAME', None)

MINIO_USE_IAM_PROVIDER = os.getenv('TYR_MINIO_USE_IAM_PROVIDER', 'true').lower() in ['1', 'true', 'yes']

MINIO_ACCESS_KEY = os.getenv('TYR_MINIO_ACCESS_KEY', None)
Expand Down
68 changes: 68 additions & 0 deletions source/tyr/tyr/poi_to_excluded_zones.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import argparse
import csv
import json
import logging


def poi_to_excluded_zones(poi_file, output_dir, instance_name):
    """Convert the excluded-zone POIs of a POI archive into one JSON file per zone.

    The zip archive ``poi_file`` is extracted into ``tmp/poi_<instance_name>``
    (relative to the current working directory) and three ';'-separated files
    are read from it:

    * ``poi_properties.txt``: rows whose second column is ``excluded_zones``
      (case-insensitive) hold, in their third column, a JSON list of zones for
      the POI identified by the first column;
    * ``poi.txt``: the first column is the POI id, the eighth its geometry id;
    * ``geometries.txt``: the first column is the geometry id, the second its
      shape.

    The i-th zone of a POI is written to
    ``<output_dir>/<poi_id>_<i>_<instance_name>.json`` and contains an ``id``
    (the file name without extension), the zone's own keys and the ``shape``
    of the POI. POIs whose geometry or shape cannot be resolved are logged and
    skipped.

    :param poi_file: path of the POI zip archive
    :param output_dir: existing directory receiving the JSON files
    :param instance_name: instance name, used in the temporary path and in ids
    """
    import zipfile

    # Resolved locally: the module only defines a global ``logger`` when it is
    # run as a script, so relying on it breaks every error path on import.
    logger = logging.getLogger(__name__)

    tmp_path = "tmp/poi_{}".format(instance_name)
    with zipfile.ZipFile(poi_file, 'r') as zip_ref:
        zip_ref.extractall(tmp_path)

    # poi id -> list of zones declared for that poi
    excluded_zones = {}
    with open(tmp_path + "/poi_properties.txt") as csvfile:
        for row in csv.reader(csvfile, delimiter=';', quotechar='"'):
            # blank or truncated rows cannot describe an excluded zone
            if len(row) < 3 or row[1].lower() != "excluded_zones":
                continue
            excluded_zones[row[0]] = json.loads(row[2])

    # poi id -> geometry id, restricted to pois having excluded zones
    excluded_geometries_ids = {}
    with open(tmp_path + "/poi.txt") as csvfile:
        for row in csv.reader(csvfile, delimiter=';', quotechar='"'):
            if len(row) < 8 or row[0] not in excluded_zones:
                continue
            excluded_geometries_ids[row[0]] = row[7]

    # Compared as sets so the check does not depend on key ordering.
    if set(excluded_geometries_ids) != set(excluded_zones):
        logger.error("not all excluded zone's pois are found in poi.txt")
        logger.error("excluded_geometries_ids: {}".format(list(excluded_geometries_ids)))
        logger.error("excluded_zones: {}".format(list(excluded_zones)))

    # geometry id -> shape
    geometries_shapes = {}
    with open(tmp_path + "/geometries.txt") as csvfile:
        for row in csv.reader(csvfile, delimiter=';', quotechar='"'):
            if len(row) < 2:
                continue
            geometries_shapes[row[0]] = row[1]

    for poi_id, zones in excluded_zones.items():
        geometry_id = excluded_geometries_ids.get(poi_id)
        if not geometry_id:
            logger.error("{} could not be found in poi.txt".format(poi_id))
            continue
        shape = geometries_shapes.get(geometry_id)
        if not shape:
            logger.error("{} could not be found in geometries.txt".format(geometry_id))
            continue

        # NOTE: all the zones of a POI share the POI's single shape; per the
        # review discussion they are meant to differ by excluded modes and/or
        # activation periods.
        for i, zone in enumerate(zones):
            output_id = "{}_{}_{}".format(poi_id, i, instance_name)
            output = {'id': output_id}
            output.update(zone)
            output["shape"] = shape
            with open(output_dir + "/{}.json".format(output_id), "w") as output_file:
                json.dump(output, output_file)


if __name__ == '__main__':
    # Command-line entry point: convert a POI archive into excluded-zone JSON
    # files under ./excluded_zones, using a placeholder instance name.
    import os

    parser = argparse.ArgumentParser()
    # Required: without an archive there is nothing to convert, and a missing
    # value would otherwise only surface as an obscure error when opening it.
    parser.add_argument('--poi', help='poi zip', required=True)
    args = parser.parse_args()
    # Kept as a module-level name for code in this module that logs through
    # ``logger``.
    logger = logging.getLogger(__name__)

    # The conversion writes its files straight into the output directory and
    # does not create it, so make sure it exists first.
    output_dir = "excluded_zones"
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    poi_to_excluded_zones(args.poi, output_dir, "dummy_instance")
4 changes: 3 additions & 1 deletion source/tyr/tyr/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
fusio2s3,
gtfs2s3,
zip_if_needed,
poi2asgard,
)
from tyr.binarisation import reload_data, move_to_backupdirectory
from tyr import celery
Expand Down Expand Up @@ -207,7 +208,8 @@ def process_ed2nav():
loki_data_source, instance.name
)
)

if dataset.type == "poi":
actions.append(poi2asgard.si(instance_config, filename, dataset_uid=dataset.uid))
actions.append(task[dataset.type].si(instance_config, filename, dataset_uid=dataset.uid))
else:
# unknown type, we skip it
Expand Down
Loading