Migration commit from heyvince.co to heyvince.ca

vgoineau 2024-05-03 09:24:48 -04:00
commit 2c898da04f
24 changed files with 679 additions and 0 deletions

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
__pycache__/
*.egg-info

0
README.md Normal file

19
cli.py Normal file

@@ -0,0 +1,19 @@
import click

from dft.commands.migrate import createmigration, migrate
from dft.commands.load import load


@click.group()
def cli():
    pass


#cli.add_command(init)
#cli.add_command(debug)
cli.add_command(createmigration)
cli.add_command(migrate)
cli.add_command(load)
#cli.add_command(deploy)

if __name__ == "__main__":
    cli()
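
For context, a minimal, self-contained sketch of the same click group/command pattern used above (the demo names here are illustrative, not part of this commit):

import click

@click.group()
def demo_cli():
    pass

@click.command()
def hello():
    click.echo("hello from a registered subcommand")

demo_cli.add_command(hello)

if __name__ == "__main__":
    demo_cli()  # invoked as: python demo.py hello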

1
dft/__init__.py Normal file

@@ -0,0 +1 @@
from . import base

1
dft/base/__init__.py Normal file

@@ -0,0 +1 @@
from . import mixin

17
dft/base/config.py Normal file

@@ -0,0 +1,17 @@
import yaml


def get_config_from_yml(filepath="dft_project.yml"):
    try:
        with open(filepath, 'r') as file:
            data = yaml.safe_load(file)
    except Exception:
        # Every dft command depends on this file; re-raise instead of
        # returning the exception object as if it were the config.
        raise
    else:
        return data


def load_config():
    data = get_config_from_yml()
    return data


config = load_config()
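
A short usage sketch, assuming a dft_project.yml like the sample later in this commit sits in the working directory and the dft package is importable:

# Hypothetical usage of the module-level config loaded above.
from dft.base.config import config as cfg

print(cfg["profile"])         # e.g. 'profile-name'
print(cfg["query-paths"])     # e.g. ['queries']
print(cfg["migration-path"])  # e.g. 'migration'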

50
dft/base/connector.py Normal file

@@ -0,0 +1,50 @@
import yaml
import os
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from .config import config as cfg
import pandas as pd


class SQLConnector():
    # Mixed into SQLSource (see sources.py); relies on self.name being set
    # by Modeler.__init__, so it needs no __init__ override of its own.

    def _get_source_url(self):
        try:
            with open(os.path.expanduser("~") + "/.dft/sources.yml", 'r') as file:
                data = yaml.safe_load(file)
                target = data[cfg["profile"]]['target']
                src_info = data[cfg["profile"]][target]["extract_from"][self.name]
                src_url = URL.create(
                    src_info["connector"],
                    username=src_info["user"],
                    password=src_info["password"],
                    host=src_info["host"],
                    port=src_info["port"],
                    database=src_info["database"],
                    query={
                        "driver": src_info["driver"],
                    },
                )
        except Exception as e:
            return e
        else:
            return src_url

    def _create_engine(self):
        try:
            engine = create_engine(self._get_source_url())
        except Exception as e:
            return e
        else:
            return engine

    def read_query(self, query) -> pd.DataFrame:
        try:
            df = pd.read_sql(query, self._create_engine())
        except Exception as e:
            return e
        else:
            return df
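
To illustrate the URL that _get_source_url assembles, a standalone sketch with dummy credentials (all values are placeholders):

from sqlalchemy.engine import URL

# Same shape as the URL built above from ~/.dft/sources.yml.
url = URL.create(
    "mssql+pyodbc",
    username="user_read",
    password="yourpassword",
    host="localhost",
    port=1433,
    database="database_name",
    query={"driver": "ODBC Driver 17 for SQL Server"},
)
print(url)
# mssql+pyodbc://user_read:yourpassword@localhost:1433/database_name?driver=ODBC+Driver+17+for+SQL+Server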

103
dft/base/files.py Normal file

@@ -0,0 +1,103 @@
import os
import re
from jinja2 import Environment, PackageLoader
from .config import config as cfg


def _list_to_dict(input_list):
    result_dict = {}
    for item in input_list:
        if '=' in item:
            # Split the item on the first '=' and strip spaces and double quotes
            key, value = item.split('=', 1)
            key = key.strip()
            value = value.strip().strip('"')
            result_dict[key] = value
    return result_dict


def delete_file(directory, donot=()):
    for root, dirs, files in os.walk(directory, topdown=False):
        for name in files:
            file_path = os.path.join(root, name)
            if name not in donot:
                try:
                    os.unlink(file_path)
                    print(f"Removing file: {file_path}")
                except Exception as e:
                    print(f"Failed to remove {file_path}. Reason: {e}")


def generate_init(query_paths):
    def get_declaration(query_paths):
        model_names = []
        for filepath in query_paths:
            bn, filename, filetype = get_filename_from_path(filepath)
            model_names.append(f'from .{filename} import {filename}')
        return model_names

    def render_init(declarations):
        try:
            env = Environment(loader=PackageLoader('dft', 'templates'))
            template = env.get_template('model_init.jinja2')
            # Render the template with variables
            rendered_content = template.render(
                model_names=declarations
            )
        except Exception as e:
            return e
        else:
            print("Init is rendered")
            return rendered_content

    lst_model = get_declaration(query_paths)
    content = render_init(lst_model)
    try:
        with open(f'{cfg["migration-path"]}/models/__init__.py', 'w') as f:
            f.write(content)
    except Exception as e:
        return e
    else:
        print("Init file is updated")
        return True


def get_filename_from_path(filepath):
    try:
        bn = os.path.basename(filepath)
        filename = os.path.splitext(bn)
    except Exception as e:
        return e
    else:
        return bn, filename[0], filename[1]


def get_filepaths(directories):
    lst_filepaths = []
    # Walk through all directories and files in the given directories
    for directory in directories:
        for root, dirs, files in os.walk(directory):
            for file in files:
                # Create full path by joining root with file name
                full_path = os.path.join(root, file)
                lst_filepaths.append(full_path)
    return lst_filepaths


def process_query_file(filepath):
    with open(filepath, 'r') as file:
        content = file.read()
    # Extract key-value pairs from within '{{ }}'
    match = re.search(r"{{\s*config\s*=\s*(\[[^\]]+\])\s*}}", content)
    if match:
        # Parse the extracted content into a Python dictionary
        config_str = match.group(1)
        config_str = re.sub(r'\s+', ' ', config_str).strip()
        config_str = config_str.strip('[]')
        config_lst = config_str.split(',')
        config = _list_to_dict(config_lst)
    else:
        config = None
    content = re.sub(r"{{.*?}}", "", content, flags=re.DOTALL)
    content = re.sub(r'\s+', ' ', content).strip()
    return config, content
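
A worked example of the query-file convention process_query_file parses, replicating the same regex logic standalone (the sample query text is made up):

import re

sample = '''{{ config = [ extract_from = "sql_server" ] }}
select *
from orders
'''

match = re.search(r"{{\s*config\s*=\s*(\[[^\]]+\])\s*}}", sample)
pairs = match.group(1).strip('[]').split(',')
config = {}
for item in pairs:
    key, value = item.split('=', 1)
    config[key.strip()] = value.strip().strip('"')
body = re.sub(r"{{.*?}}", "", sample, flags=re.DOTALL)
body = re.sub(r"\s+", " ", body).strip()

print(config)  # {'extract_from': 'sql_server'}
print(body)    # select * from orders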

19
dft/base/loader.py Normal file

@@ -0,0 +1,19 @@
import re


class SQLLoader():
    # Mixed into SQLSource (see sources.py); holds no state of its own,
    # so it needs no __init__ override.

    def _get_initial_load_query(self, query_name, query):
        # Pull the body of the CTE named `query_name` out of the query.
        match = re.search(r'with ' + query_name + r' as \((.*?)\)', query, re.DOTALL)
        if match:
            initial_load_query = match.group(1).strip()
            return initial_load_query
        else:
            return None

    def init_load(self, query_name, query):
        #TODO: _is_table_empty is not defined anywhere in this commit yet
        self._is_table_empty()
        self._get_initial_load_query(query_name, query)
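
_get_initial_load_query extracts the body of the named CTE; a standalone illustration of the regex (query text is made up):

import re

query = "with orders as (select * from raw_orders) select * from orders"
match = re.search(r"with orders as \((.*?)\)", query, re.DOTALL)
print(match.group(1).strip())  # select * from raw_orders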

24
dft/base/migrator.py Normal file

@@ -0,0 +1,24 @@
from alembic.config import Config
from alembic import command
from .config import config as cfg
import yaml
import os


def get_target_url():
    with open(os.path.expanduser("~") + "/.dft/sources.yml", 'r') as file:
        data = yaml.safe_load(file)
        target = data[cfg["profile"]]['target']
        conn_info = data[cfg["profile"]][target]["load_to"][cfg["target-name"]]
        connection_url = f'mssql+pyodbc://{conn_info["user"]}:{conn_info["password"]}@{conn_info["host"]}:{conn_info["port"]}/{conn_info["database"]}?driver={conn_info["driver"].replace(" ", "+")}'
    return connection_url


def create_migration_file(script_message):
    alembic_cfg = Config(f'{cfg["migration-path"]}/alembic.ini')
    alembic_cfg.set_main_option("sqlalchemy.url", get_target_url())
    command.revision(alembic_cfg, autogenerate=True, message=script_message)


def migrate():
    alembic_cfg = Config(f'{cfg["migration-path"]}/alembic.ini')
    alembic_cfg.set_main_option("sqlalchemy.url", get_target_url())
    command.upgrade(alembic_cfg, "head")
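
For reference, the connection string get_target_url builds has this shape (placeholder values):

conn_info = {
    "user": "user_write", "password": "password", "host": "localhost",
    "port": 1433, "database": "database_name",
    "driver": "ODBC Driver 17 for SQL Server",
}
url = (f'mssql+pyodbc://{conn_info["user"]}:{conn_info["password"]}'
       f'@{conn_info["host"]}:{conn_info["port"]}/{conn_info["database"]}'
       f'?driver={conn_info["driver"].replace(" ", "+")}')
print(url)
# mssql+pyodbc://user_write:password@localhost:1433/database_name?driver=ODBC+Driver+17+for+SQL+Server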

9
dft/base/mixin.py Normal file

@@ -0,0 +1,9 @@
from sqlalchemy import Column, Integer


class DFTMixin():
    #TODO: add repeatable field for slow dim etc.
    # Surrogate key and batch tag stamped onto every generated model.
    Pk = Column("Pk", Integer, primary_key=True)
    batch_id = Column("BatchID", Integer, nullable=True)
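
A small sketch of how the mixin stamps its columns onto a declarative model; DemoTable is hypothetical, but generated models inherit the same way (see the model_declaration template later in this commit):

from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base
from dft.base.mixin import DFTMixin

Base = declarative_base()

class DemoTable(Base, DFTMixin):  # hypothetical model
    __tablename__ = "demo_table"
    name = Column("Name", String)

print(sorted(c.name for c in DemoTable.__table__.columns))
# ['BatchID', 'Name', 'Pk']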

91
dft/base/modeler.py Normal file

@@ -0,0 +1,91 @@
from jinja2 import Environment, PackageLoader
from sqlalchemy import text
from datetime import datetime, date, time
from decimal import Decimal
from .config import config as cfg
from . import files


class Modeler():
    def __init__(self, src_name, filepath):
        self.name = src_name
        self.filepath = filepath
        self.bn, self.filename, self.filetype = files.get_filename_from_path(self.filepath)

    def infer_type_from_value(self, value):
        # bool must be tested before int: isinstance(True, int) is True,
        # so booleans would otherwise be mapped to Integer.
        if isinstance(value, bool):
            return 'Boolean'  # Maps to BOOLEAN in SQL
        elif isinstance(value, int):
            return 'Integer'  # Maps to INTEGER in SQL
        elif isinstance(value, str):
            return 'String'  # Maps to VARCHAR in SQL
        elif isinstance(value, float):
            return 'Float'  # Maps to FLOAT in SQL
        elif isinstance(value, Decimal):
            # Fixed precision and scale for decimal values
            return 'Numeric(12, 3)'  # Maps to NUMERIC in SQL
        elif isinstance(value, datetime):
            return 'DateTime'  # Maps to DATETIME in SQL
        elif isinstance(value, date):
            return 'Date'  # Maps to DATE in SQL
        elif isinstance(value, time):
            return 'Time'  # Maps to TIME in SQL
        elif isinstance(value, bytes):
            return 'LargeBinary'  # Maps to BLOB or BYTEA in SQL
        else:
            return 'String'  # Default fall-back type

    def render_model(self, columns):
        version = datetime.now().strftime("%Y%m%d%H%M%S")
        try:
            env = Environment(loader=PackageLoader('dft', 'templates'))
            template = env.get_template('model_declaration.jinja2')
            rendered_content = template.render(
                from_file=f"{version}_{str(self.filepath)}",
                migration_path=cfg["migration-path"],
                model_name=self.filename,
                revision_id=version,
                create_date=datetime.now().isoformat(),
                columns=columns
            )
        except Exception as e:
            return e
        else:
            print(f"Model {self.filename} content is rendered")
            return rendered_content

    def write_model_to_file(self, content):
        to_file = cfg["migration-path"] + f"/models/{self.filename}.py"
        try:
            # Write the rendered content to a file
            with open(to_file, 'w') as f:
                f.write(content)
        except Exception as e:
            return e
        else:
            print(f"Model {self.filename} has been written to file {to_file} successfully!")
            return True


class SQLModeler(Modeler):
    def __init__(self, src_name, filepath):
        super().__init__(src_name, filepath)

    def infer_model_columns(self, result):
        first_row = result.fetchone()  # assumes the query returned at least one row
        columns = []
        for column, value in zip(result.keys(), first_row):
            sql_type = self.infer_type_from_value(value)
            declaration = f'{column} = Column("{column}", {sql_type})'
            columns.append(declaration)
        return columns

    def generate_model(self, query):
        with self._create_engine().connect() as cnt:
            rst = cnt.execute(text(query))
            columns = self.infer_model_columns(rst)
            content = self.render_model(columns)
            self.write_model_to_file(content)
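
To show what the inference produces, a simplified standalone sketch of the infer_model_columns loop over a made-up first row (the mapping below covers only the types in the sample):

from datetime import datetime
from decimal import Decimal

row = {"OrderID": 42, "Amount": Decimal("19.990"),
       "PlacedAt": datetime(2024, 5, 3), "Note": "rush"}

type_map = {int: "Integer", Decimal: "Numeric(12, 3)",
            datetime: "DateTime", str: "String"}
for column, value in row.items():
    print(f'{column} = Column("{column}", {type_map[type(value)]})')
# OrderID = Column("OrderID", Integer)
# Amount = Column("Amount", Numeric(12, 3))
# PlacedAt = Column("PlacedAt", DateTime)
# Note = Column("Note", String)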

30
dft/base/sources.py Normal file

@@ -0,0 +1,30 @@
import yaml
import os
from .connector import SQLConnector
from .modeler import SQLModeler
from .loader import SQLLoader
from .config import config as cfg


def get_source_type(cnt_name, action="extract_from"):
    with open(os.path.expanduser("~") + "/.dft/sources.yml", 'r') as file:
        data = yaml.safe_load(file)
        target = data[cfg["profile"]]['target']
        return data[cfg["profile"]][target][action][cnt_name]["type"]


def get_source(src_name, filepath, action="extract_from"):
    src_type = get_source_type(src_name, action)
    #TODO: select a connector depending on other source types
    if src_type == "sql":
        return SQLSource(src_name, filepath)
    else:
        raise NotImplementedError("Source type not supported for now.")


class SQLSource(SQLModeler, SQLConnector, SQLLoader):
    # MRO is SQLModeler -> Modeler -> SQLConnector -> SQLLoader, and
    # Modeler.__init__ sets self.name / self.filepath for all the mixins.
    type = "sql"

    def __init__(self, src_name, filepath) -> None:
        super().__init__(src_name, filepath)

0
dft/commands/__init__.py Normal file

7
dft/commands/deploy.py Normal file

@@ -0,0 +1,7 @@
import click


@click.command()
@click.option('-c', '--config', default="dft_project.yml", help='Build your flows with a scheduler and deploy them to your server.')
def deploy(config):
    # The long option name is required for click to pass the value as `config`.
    click.echo("Deploying your query files to server...")

16
dft/commands/init.py Normal file

@@ -0,0 +1,16 @@
import click


@click.command()
def init():
    click.echo("Initializing your dft environment...")
    # Create folder .dft inside home
    # Add profile.yml file inside .dft folder
    # Init Alembic with folder migrations
    # Move alembic.ini
    # Create models folder inside migrations
    # Add base.py inside migration/models/
    # Create files inside folder models
    # Generate first init.py with dftbatch + base
    # Add to env.py: from migration import models + models.Base.metadata
    # Create folder query
    # Add dft_project.yml file
    # (A sketch of these steps follows below.)
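
A minimal sketch of what that checklist could look like in code; the paths and stub contents are assumptions based on the rest of this commit, not an implementation that ships here:

from pathlib import Path

def scaffold():  # hypothetical helper
    # ~/.dft with a sources.yml stub (read by connector.py and migrator.py)
    dft_home = Path.home() / ".dft"
    dft_home.mkdir(exist_ok=True)
    (dft_home / "sources.yml").touch()
    # Project layout matching the dft_project.yml defaults
    for folder in ("queries", "tests", "migration/models"):
        Path(folder).mkdir(parents=True, exist_ok=True)
    Path("dft_project.yml").touch()
    # The Alembic init and the env.py edits from the checklist stay manual here.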

35
dft/commands/load.py Normal file

@@ -0,0 +1,35 @@
import click
from ..base import files, sources
from ..base.config import config as cfg


@click.command()
def load():
    click.echo("Loading data from your query to your target db...")
    #TODO: Validate init is done
    # Iterate over query-paths
    query_paths = files.get_filepaths(cfg["query-paths"])
    for filepath in query_paths:
        conf, query = files.process_query_file(filepath)
        # Insert the data into the target db
        src = sources.get_source(conf["extract_from"], filepath)
        rst_src = src.read_query(query)
        cnt_to = sources.get_source(
            cfg["target-name"],
            filepath,
            action="load_to",  # the target connection lives under load_to
        )
        # add batch_id to df
        #df["BatchID"] = 20100
        # Manipulate the DataFrame if necessary (e.g., filtering, adding new columns)
        # Example: df['new_column'] = df['column1'] * 2
        #df.to_sql(cnt_to.filename, con=cnt_to.engine, if_exists='append', index=False)
        # (a hedged sketch of these remaining steps follows below)
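
One way the commented-out tail of load() could be completed, assuming the target source's engine comes from _create_engine() and the table is named after the query file; this is a sketch, not behavior this commit ships:

from datetime import datetime

# Hypothetical continuation inside the loop above; rst_src and cnt_to
# are the names bound in load().
df = rst_src
df["BatchID"] = int(datetime.now().strftime("%Y%m%d%H%M%S"))  # tag the batch
df.to_sql(
    cnt_to.filename,               # table named after the query file
    con=cnt_to._create_engine(),   # engine resolved from ~/.dft/sources.yml
    if_exists="append",
    index=False,
)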

32
dft/commands/migrate.py Normal file

@@ -0,0 +1,32 @@
import click
from ..base import files, sources
from ..base.config import config as cfg
from ..base import migrator as mig


@click.command()
@click.option('-m', default="", help='Description of your migration')
def createmigration(m):
    #TODO: Validate init is done
    click.echo("Generating migration files... but first, let's remove your models")
    # Delete files from the models folder
    files.delete_file(cfg["migration-path"] + "/models", donot=["base.py"])
    # Iterate over query-paths
    query_paths = files.get_filepaths(cfg["query-paths"])
    for filepath in query_paths:
        conf, query = files.process_query_file(filepath)
        src = sources.get_source(conf["extract_from"], filepath)
        src.generate_model(query)
    # Generate the models __init__ file
    files.generate_init(query_paths)
    mig.create_migration_file(script_message=str(m))
    click.echo("Migration creation is done!")


@click.command()
def migrate():
    mig.migrate()

32
sources.yml Normal file

@@ -0,0 +1,32 @@
profile-name:
  target: dev
  dev:
    extract_from:
      sql_server:
        type: sql
        connector: mssql+pyodbc
        driver: 'ODBC Driver 17 for SQL Server'
        host: localhost
        port: 1433
        database: database_name
        schema: dbo
        user: user_read
        password: yourpassword
        encrypt: false
      # source2:
      #   connector: test
    load_to:
      sql_server:
        type: sql
        connector: mssql+pyodbc
        driver: 'ODBC Driver 17 for SQL Server'
        host: localhost
        port: 1433
        database: database_name
        schema: dbo
        user: user_write
        password: password
        encrypt: false
  # prod:
  #   extract_from:
  #   load_to:

16
dft_project.yml Normal file

@@ -0,0 +1,16 @@
# Name your project! Project names should contain only lowercase characters
name: 'waffle_shop'
version: '1.0.0'
config-version: 2

# Just like dbt, this setting configures which "profile" dft uses for this project.
# Do not forget to configure your sources.yml file in the ~/.dft folder
profile: 'profile-name'
target-name: 'target_connection_name' # Connection name under load_to in ~/.dft/sources.yml

# These configurations specify where dft should look for different types of files.
# The `query-paths` config, for example, states that queries in this project can be
# found in the "queries/" directory. You probably won't need to change these!
test-paths: ["tests"]
query-paths: ["queries"]
migration-path: "migration"

15
dft/templates/model_declaration.jinja2 Normal file

@@ -0,0 +1,15 @@
from sqlalchemy import Column, Integer, String, Float, DateTime, Numeric, Date, Time, Boolean, LargeBinary
from dft.base.mixin import DFTMixin
from .base import Base

####
# Model generated by DFT
# - From : {{ from_file }}
# - Version : {{ revision_id }}
# - Created Date : {{ create_date }}
####
class {{ model_name }}(Base, DFTMixin):
    __tablename__ = '{{ model_name }}'
    {% for item in columns %}
    {{ item }}{% endfor %}

3
dft/templates/model_init.jinja2 Normal file

@@ -0,0 +1,3 @@
from .base import Base
{% for item in model_names %}
{{ item }}{% endfor %}

131
requirements.txt Normal file

@@ -0,0 +1,131 @@
agate==1.7.1
aiosqlite==0.20.0
alembic==1.13.1
annotated-types==0.6.0
anyio==3.7.1
apprise==1.7.5
asgi-lifespan==2.1.0
async-timeout==4.0.3
asyncpg==0.29.0
attrs==23.2.0
azure-core==1.30.1
azure-identity==1.16.0
Babel==2.14.0
build==1.2.1
cachetools==5.3.3
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
cloudpickle==3.0.0
colorama==0.4.6
coolname==2.2.0
croniter==2.0.3
cryptography==42.0.5
dateparser==1.2.0
dbt-core==1.7.11
dbt-extractor==0.5.1
dbt-fabric==1.7.4
dbt-semantic-interfaces==0.4.4
dbt-sqlserver==1.7.4
Deprecated==1.2.14
dnspython==2.6.1
docker==6.1.3
email_validator==2.1.1
exceptiongroup==1.2.0
fsspec==2024.3.1
google-auth==2.29.0
graphviz==0.20.3
greenlet==3.0.3
griffe==0.42.1
h11==0.14.0
h2==4.1.0
hpack==4.0.0
httpcore==1.0.5
httpx==0.27.0
humanize==4.9.0
hyperframe==6.0.1
idna==3.7
importlib-metadata==6.11.0
importlib_resources==6.1.3
isodate==0.6.1
itsdangerous==2.1.2
Jinja2==3.1.3
jinja2-humanize-extension==0.4.0
jsonpatch==1.33
jsonpointer==2.4
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
kubernetes==29.0.0
leather==0.4.0
Logbook==1.5.3
Mako==1.3.3
Markdown==3.6
markdown-it-py==3.0.0
MarkupSafe==2.1.5
mashumaro==3.12
mdurl==0.1.2
minimal-snowplow-tracker==0.0.2
more-itertools==10.2.0
msal==1.28.0
msal-extensions==1.1.0
msgpack==1.0.8
networkx==3.3
numpy==1.26.4
oauthlib==3.2.2
opentelemetry-api==1.24.0
orjson==3.10.0
packaging==24.0
pandas==2.2.2
parsedatetime==2.6
pathspec==0.11.2
pendulum==2.1.2
portalocker==2.8.2
prefect==2.16.9
protobuf==4.25.3
pyasn1==0.6.0
pyasn1_modules==0.4.0
pycparser==2.22
pydantic==2.7.0
pydantic_core==2.18.1
Pygments==2.17.2
PyJWT==2.8.0
pyodbc==5.0.1
pyproject_hooks==1.0.0
python-dateutil==2.9.0.post0
python-multipart==0.0.9
python-slugify==8.0.4
pytimeparse==1.1.8
pytz==2024.1
pytzdata==2020.1
PyYAML==6.0.1
readchar==4.0.6
referencing==0.34.0
regex==2023.12.25
requests==2.31.0
requests-oauthlib==2.0.0
rfc3339-validator==0.1.4
rich==13.7.1
rpds-py==0.18.0
rsa==4.9
ruamel.yaml==0.18.6
ruamel.yaml.clib==0.2.8
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
SQLAlchemy==2.0.29
sqlparse==0.4.4
text-unidecode==1.3
toml==0.10.2
tomli==2.0.1
typer==0.12.3
typing_extensions==4.11.0
tzdata==2024.1
tzlocal==5.2
ujson==5.9.0
urllib3==1.26.18
uvicorn==0.28.1
websocket-client==1.7.0
websockets==12.0
wrapt==1.16.0
zipp==3.18.1

26
setup.py Normal file

@@ -0,0 +1,26 @@
from setuptools import setup, find_packages

setup(
    name='dft',
    version='0.1.0',
    packages=find_packages(),
    py_modules=['cli'],  # cli.py sits at the repo root, outside any package
    include_package_data=True,
    description='Data flow tools',
    long_description='Use dft to do extraction and load in your data pipeline',
    author='Vincent Goineau',
    author_email='vgoineau@ctocollective.com',
    package_data={
        'dft': ['templates/*.jinja2'],  # the templates are .jinja2, not .jinja
    },
    install_requires=[
        'click',
        'sqlalchemy',
        'alembic',
        'jinja2',
        'pandas',  # used by dft.base.connector
        'PyYAML',  # used by dft.base.config and the source loaders
    ],
    entry_points={
        'console_scripts': [
            'dft = cli:cli',
        ],
    },
)