Airflow DAG / pipeline examples reference

Posted by lightsong on 2024-08-06

data-pipelines-with-apache-airflow

https://github.com/BasPH/data-pipelines-with-apache-airflow

Code for Data Pipelines with Apache Airflow

https://www.manning.com/books/data-pipelines-with-apache-airflow

A successful pipeline moves data efficiently, minimizing pauses and blockages between tasks, keeping every process along the way operational. Apache Airflow provides a single customizable environment for building and managing data pipelines, eliminating the need for a hodgepodge collection of tools, snowflake code, and homegrown processes. Using real-world scenarios and examples, Data Pipelines with Apache Airflow teaches you how to simplify and automate data pipelines, reduce operational overhead, and smoothly integrate all the technologies in your stack.
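Before the longer examples below, a minimal sketch of what an Airflow DAG file looks like may help orient the reader (assumes Airflow 2.x; the dag_id, schedule and callable are made-up illustrations, not code from the book):

import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator


def _say_hello():
    print("hello from Airflow")


# A DAG is just a Python file the scheduler parses: task definitions plus dependencies.
with DAG(
    dag_id="hello_dag",  # hypothetical example id
    schedule_interval="@daily",
    start_date=datetime.datetime(2024, 1, 1),
    catchup=False,
):
    PythonOperator(task_id="say_hello", python_callable=_say_hello)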

Fetch web data and store it

https://github.com/BasPH/data-pipelines-with-apache-airflow/blob/master/chapter14/dags/nyc_dag.py

import ast
import io
import json

import airflow.utils.dates
import geopandas
import pandas as pd
import requests
from airflow.hooks.base import BaseHook
from airflow.models import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from minio import Minio
from nyctransport.operators.pandas_operator import PandasOperator
from nyctransport.operators.s3_to_postgres import MinioPandasToPostgres
from requests.auth import HTTPBasicAuth

dag = DAG(
    dag_id="nyc_dag",
    schedule_interval="*/15 * * * *",
    start_date=airflow.utils.dates.days_ago(1),
    catchup=False,
)


def _download_citi_bike_data(ts_nodash, **_):
    """Fetch the last 15 minutes of Citi Bike rides and store the raw JSON in MinIO."""
    citibike_conn = BaseHook.get_connection(conn_id="citibike")

    url = f"http://{citibike_conn.host}:{citibike_conn.port}/recent/minute/15"
    response = requests.get(
        url, auth=HTTPBasicAuth(citibike_conn.login, citibike_conn.password)
    )
    data = response.json()

    s3_hook = S3Hook(aws_conn_id="s3")
    s3_hook.load_string(
        string_data=json.dumps(data),
        key=f"raw/citibike/{ts_nodash}.json",
        bucket_name="datalake",
    )


download_citi_bike_data = PythonOperator(
    task_id="download_citi_bike_data", python_callable=_download_citi_bike_data, dag=dag
)
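# The "citibike" and "s3" connections referenced above must exist in the Airflow
# metastore (the chapter14 setup in the repository is meant to provide them). Outside
# that environment they could, for example, be supplied as environment variables
# (illustrative values only):
#   AIRFLOW_CONN_CITIBIKE=http://citibike:cycling@citibike_api:5000
# The "s3" connection points at MinIO and carries the endpoint and access keys in its
# "extra" field, which get_minio_object below reads via extra_dejson.
# ts_nodash is an Airflow-templated context variable (e.g. 20240806T120000) used here
# to key each raw file to the logical run that produced it.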


def _download_taxi_data():
    """Copy new taxi file exports into the data lake and return the uploaded keys."""
    taxi_conn = BaseHook.get_connection(conn_id="taxi")
    s3_hook = S3Hook(aws_conn_id="s3")

    url = f"http://{taxi_conn.host}"
    response = requests.get(url)
    files = response.json()

    exported_files = []
    for filename in [f["name"] for f in files]:
        response = requests.get(f"{url}/{filename}")
        s3_key = f"raw/taxi/{filename}"
        try:
            s3_hook.load_string(
                string_data=response.text, key=s3_key, bucket_name="datalake"
            )
            print(f"Uploaded {s3_key} to MinIO.")
            exported_files.append(s3_key)
        except ValueError:
            print(f"File {s3_key} already exists.")

    return exported_files


download_taxi_data = PythonOperator(
    task_id="download_taxi_data", python_callable=_download_taxi_data, dag=dag
)
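# The list of uploaded S3 keys returned above is pushed to XCom automatically (it is
# the return value of a PythonOperator callable), and the process_taxi_data task below
# pulls it back with the templated "{{ ti.xcom_pull(task_ids='download_taxi_data') }}"
# expression.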


def get_minio_object(
    pandas_read_callable, bucket, paths, pandas_read_callable_kwargs=None
):
    """Read one or more MinIO objects and concatenate them into a single DataFrame."""
    s3_conn = BaseHook.get_connection(conn_id="s3")
    minio_client = Minio(
        s3_conn.extra_dejson["host"].split("://")[1],
        access_key=s3_conn.extra_dejson["aws_access_key_id"],
        secret_key=s3_conn.extra_dejson["aws_secret_access_key"],
        secure=False,
    )

    if isinstance(paths, str):
        if paths.startswith("[") and paths.endswith("]"):
            # After template rendering, the XCom-pulled list of keys arrives as the
            # string representation of a Python list; parse it back with
            # ast.literal_eval rather than eval, since the string comes from task output.
            paths = ast.literal_eval(paths)
        else:
            paths = [paths]

    if pandas_read_callable_kwargs is None:
        pandas_read_callable_kwargs = {}

    dfs = []
    for path in paths:
        minio_object = minio_client.get_object(bucket_name=bucket, object_name=path)
        df = pandas_read_callable(minio_object, **pandas_read_callable_kwargs)
        dfs.append(df)
    return pd.concat(dfs)


def transform_citi_bike_data(df):
    # Map citi bike lat,lon coordinates to taxi zone ids
    taxi_zones = geopandas.read_file(
        "https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip"
    ).to_crs("EPSG:4326")
    start_gdf = geopandas.GeoDataFrame(
        df,
        crs="EPSG:4326",
        geometry=geopandas.points_from_xy(
            df["start_station_longitude"], df["start_station_latitude"]
        ),
    )
    end_gdf = geopandas.GeoDataFrame(
        df,
        crs="EPSG:4326",
        geometry=geopandas.points_from_xy(
            df["end_station_longitude"], df["end_station_latitude"]
        ),
    )
    # Spatial join: assign each start/end point to the taxi zone polygon that
    # contains it. (Newer geopandas releases renamed the op= keyword to predicate=;
    # the version used in the book's environment still accepts op=.)
    df_with_zones = geopandas.sjoin(
        start_gdf, taxi_zones, how="left", op="within"
    ).rename(columns={"LocationID": "start_location_id"})
    end_zones = geopandas.sjoin(end_gdf, taxi_zones, how="left", op="within")
    df_with_zones["end_location_id"] = end_zones["LocationID"]
    return df_with_zones[
        [
            "tripduration",
            "starttime",
            "start_location_id",
            "stoptime",
            "end_location_id",
        ]
    ]


def write_minio_object(
    df, pandas_write_callable, bucket, path, pandas_write_callable_kwargs=None
):
    """Serialize a DataFrame with the given pandas writer and upload it to MinIO."""
    s3_conn = BaseHook.get_connection(conn_id="s3")
    minio_client = Minio(
        s3_conn.extra_dejson["host"].split("://")[1],
        access_key=s3_conn.extra_dejson["aws_access_key_id"],
        secret_key=s3_conn.extra_dejson["aws_secret_access_key"],
        secure=False,
    )
    bytes_buffer = io.BytesIO()
    pandas_write_method = getattr(df, pandas_write_callable.__name__)
    pandas_write_method(bytes_buffer, **pandas_write_callable_kwargs)
    nbytes = bytes_buffer.tell()
    bytes_buffer.seek(0)
    minio_client.put_object(
        bucket_name=bucket, object_name=path, length=nbytes, data=bytes_buffer
    )


process_citi_bike_data = PandasOperator(
    task_id="process_citi_bike_data",
    input_callable=get_minio_object,
    input_callable_kwargs={
        "pandas_read_callable": pd.read_json,
        "bucket": "datalake",
        "paths": "raw/citibike/{{ ts_nodash }}.json",
    },
    transform_callable=transform_citi_bike_data,
    output_callable=write_minio_object,
    output_callable_kwargs={
        "bucket": "datalake",
        "path": "processed/citibike/{{ ts_nodash }}.parquet",
        "pandas_write_callable": pd.DataFrame.to_parquet,
        "pandas_write_callable_kwargs": {"engine": "auto"},
    },
    dag=dag,
)
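# PandasOperator is not a built-in Airflow operator; it comes from the book's
# nyctransport package. It chains three callables: input_callable reads a DataFrame,
# transform_callable reshapes it, and output_callable writes the result, with the
# *_callable_kwargs dicts rendered as templates (hence "{{ ts_nodash }}" above) before
# execution. A rough sketch of the pattern follows this listing.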


def transform_taxi_data(df):
    """Align taxi records with the Citi Bike schema: timestamps, duration, column names."""
    df[["pickup_datetime", "dropoff_datetime"]] = df[
        ["pickup_datetime", "dropoff_datetime"]
    ].apply(pd.to_datetime)
    df["tripduration"] = (
        (df["dropoff_datetime"] - df["pickup_datetime"]).dt.total_seconds().astype(int)
    )
    df = df.rename(
        columns={
            "pickup_datetime": "starttime",
            "pickup_locationid": "start_location_id",
            "dropoff_datetime": "stoptime",
            "dropoff_locationid": "end_location_id",
        }
    ).drop(columns=["trip_distance"])
    return df


process_taxi_data = PandasOperator(
    task_id="process_taxi_data",
    input_callable=get_minio_object,
    input_callable_kwargs={
        "pandas_read_callable": pd.read_csv,
        "bucket": "datalake",
        "paths": "{{ ti.xcom_pull(task_ids='download_taxi_data') }}",
    },
    transform_callable=transform_taxi_data,
    output_callable=write_minio_object,
    output_callable_kwargs={
        "bucket": "datalake",
        "path": "processed/taxi/{{ ts_nodash }}.parquet",
        "pandas_write_callable": pd.DataFrame.to_parquet,
        "pandas_write_callable_kwargs": {"engine": "auto"},
    },
    dag=dag,
)

taxi_to_db = MinioPandasToPostgres(
    task_id="taxi_to_db",
    dag=dag,
    minio_conn_id="s3",
    minio_bucket="datalake",
    minio_key="processed/taxi/{{ ts_nodash }}.parquet",
    pandas_read_callable=pd.read_parquet,
    postgres_conn_id="result_db",
    postgres_table="taxi_rides",
    pre_read_transform=lambda x: io.BytesIO(x.data),
)

citi_bike_to_db = MinioPandasToPostgres(
    task_id="citi_bike_to_db",
    dag=dag,
    minio_conn_id="s3",
    minio_bucket="datalake",
    minio_key="processed/citibike/{{ ts_nodash }}.parquet",
    pandas_read_callable=pd.read_parquet,
    postgres_conn_id="result_db",
    postgres_table="citi_bike_rides",
    pre_read_transform=lambda x: io.BytesIO(x.data),
)
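# MinioPandasToPostgres (used by both taxi_to_db and citi_bike_to_db, also from the
# nyctransport package) reads the processed Parquet file back from MinIO and writes it
# into the given Postgres table. pre_read_transform wraps the raw MinIO response bytes
# in a BytesIO object so that pd.read_parquet can treat it as a file.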

download_citi_bike_data >> process_citi_bike_data >> citi_bike_to_db
download_taxi_data >> process_taxi_data >> taxi_to_db
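As noted in the comments above, PandasOperator and MinioPandasToPostgres are custom operators shipped with the book's nyctransport package rather than part of Airflow itself. Purely as an illustration of the pattern (not the book's actual implementation), a read-transform-write operator of this kind might be structured roughly like this:

from airflow.models import BaseOperator


class PandasLikeOperator(BaseOperator):
    """Illustrative sketch: read a DataFrame, transform it, write it back out."""

    # Ask Airflow to render Jinja templates (e.g. {{ ts_nodash }}) inside these kwargs.
    template_fields = ("_input_callable_kwargs", "_output_callable_kwargs")

    def __init__(
        self,
        input_callable,
        output_callable,
        transform_callable=None,
        input_callable_kwargs=None,
        output_callable_kwargs=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self._input_callable = input_callable
        self._transform_callable = transform_callable
        self._output_callable = output_callable
        self._input_callable_kwargs = input_callable_kwargs or {}
        self._output_callable_kwargs = output_callable_kwargs or {}

    def execute(self, context):
        df = self._input_callable(**self._input_callable_kwargs)   # e.g. get_minio_object
        if self._transform_callable is not None:
            df = self._transform_callable(df)                      # e.g. transform_citi_bike_data
        self._output_callable(df, **self._output_callable_kwargs)  # e.g. write_minio_object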

awesome-apache-airflow

https://github.com/jghoman/awesome-apache-airflow

  • Vital links
  • Airflow deployment solutions
  • Introductions and tutorials
  • Airflow Summit 2020 Videos
  • Best practices, lessons learned and cool use cases
  • Books, blogs, podcasts, and such
  • Slide deck presentations and online videos
  • Libraries, Hooks, Utilities
  • Meetups
  • Commercial Airflow-as-a-service providers
  • Cloud Composer resources
  • Non-English resources

use-case-airflow-llm-rag-finance

Scrape web pages, parse the documents, and store them in a vector database.

https://github.com/fanqingsong/use-case-airflow-llm-rag-finance

LLMOps: Automatic retrieval-augmented generation with Airflow, GPT-4 and Weaviate

This repository contains the DAG code used in the LLMOps: Automatic retrieval-augmented generation with Airflow, GPT-4 and Weaviate use case. The pipeline was modelled after the Ask Astro reference architecture.

The DAGs in this repository use the following tools (a rough sketch of the ingest flow follows the list):

  • Weaviate Airflow provider
  • Streamlit
  • Weaviate
  • FinBERT
  • OpenAI GPT-4
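As a rough illustration of the flow described above (fetch documents, parse and chunk them, then load them into Weaviate), the skeleton of such an ingest DAG could look like the sketch below. The task names, the placeholder URL and the function bodies are assumptions for illustration only; the repository's actual DAGs use the Weaviate Airflow provider, FinBERT and GPT-4 for the real work.

import datetime

from airflow.decorators import dag, task


@dag(schedule="@daily", start_date=datetime.datetime(2024, 1, 1), catchup=False)
def finance_rag_ingest():
    @task
    def extract():
        # Hypothetical: collect the source documents (news, filings) to ingest.
        return ["https://example.com/some-filing"]

    @task
    def split(urls):
        # Hypothetical: download each document and chunk it into text passages.
        return [{"source": url, "text": "..."} for url in urls]

    @task
    def embed_and_load(chunks):
        # Hypothetical: compute embeddings and write the chunks into a Weaviate
        # collection; the real DAGs delegate this to the Weaviate Airflow provider.
        ...

    embed_and_load(split(extract()))


finance_rag_ingest()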
