Migration commit from heyvince.co to heyvince.ca

This commit is contained in:
vgoineau
2024-05-03 09:24:48 -04:00
commit 2c898da04f
24 changed files with 679 additions and 0 deletions

1
dft/base/__init__.py Normal file
View File

@@ -0,0 +1 @@
from . import mixin

17
dft/base/config.py Normal file
View File

@@ -0,0 +1,17 @@
import yaml
def get_config_from_yml(filepath="dft_project.yml"):
    """Parse the project configuration YAML at *filepath*.

    Returns the parsed data on success. On any failure the caught
    exception object itself is returned instead of being raised —
    callers are expected to check the result's type.
    """
    try:
        with open(filepath, 'r') as stream:
            parsed = yaml.safe_load(stream)
        return parsed
    except Exception as err:
        return err
def load_config():
    """Load dft_project.yml from the current working directory."""
    return get_config_from_yml()


# Module-level configuration shared by the rest of the package; note this
# runs at import time, so the YAML must be readable from the CWD (or the
# resulting exception object is stored here — see get_config_from_yml).
config = load_config()

50
dft/base/connector.py Normal file
View File

@@ -0,0 +1,50 @@
import yaml
import os
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from .config import config as cfg
import pandas as pd
class SQLConnector():
    """Mixin that builds SQLAlchemy engines for a named extract source.

    Designed to be combined (via MRO) with a class that stores
    ``self.name``; on its own this class sets no attributes. All methods
    follow the package convention of returning the caught exception
    object instead of raising.
    """

    def __init__(self, src_name, filepath):
        # Cooperative constructor: forwards to the next class in the MRO,
        # which is expected to actually store src_name/filepath.
        super().__init__(src_name, filepath)

    def _get_source_url(self):
        """Build the SQLAlchemy URL for source ``self.name``.

        Reads ~/.dft/sources.yml, resolves the active profile's target and
        its "extract_from" entry. Returns the URL, or the exception.
        """
        sources_path = os.path.expanduser("~") + "/.dft/sources.yml"
        try:
            with open(sources_path, 'r') as file:
                data = yaml.safe_load(file)
            profile = data[cfg["profile"]]
            src_info = profile[profile['target']]["extract_from"][self.name]
            return URL.create(
                src_info["connector"],
                username=src_info["user"],
                password=src_info["password"],
                host=src_info["host"],
                port=src_info["port"],
                database=src_info["database"],
                query={
                    "driver": src_info["driver"],
                },
            )
        except Exception as err:
            return err

    def _create_engine(self):
        """Create an engine from the source URL; returns the exception on failure."""
        try:
            return create_engine(self._get_source_url())
        except Exception as err:
            return err

    def read_query(self, query) -> pd.DataFrame:
        """Execute *query* against the source and return a DataFrame (or the exception)."""
        try:
            return pd.read_sql(query, self._create_engine())
        except Exception as err:
            return err

103
dft/base/files.py Normal file
View File

@@ -0,0 +1,103 @@
from prefect import task
import os
import re
from jinja2 import Environment, PackageLoader
from .config import config as cfg
def _list_to_dict(input_list):
result_dict = {}
for item in input_list:
if '=' in item:
# Split the item on the first '=' and strip spaces and double quotes
key, value = item.split('=', 1)
key = key.strip()
value = value.strip().strip('"')
result_dict[key] = value
return result_dict
def delete_file(directory, donot=()):
    """Recursively delete every file under *directory*.

    Args:
        directory: root directory to walk.
        donot: iterable of file basenames to keep. The default was a
            mutable list (``[]``) — a Python anti-pattern — replaced by
            an immutable empty tuple; callers passing a list still work.

    Directories themselves are left in place. Individual failures are
    reported and the walk continues (best-effort cleanup).
    """
    for root, dirs, files in os.walk(directory, topdown=False):
        for name in files:
            file_path = os.path.join(root, name)
            if name not in donot:
                try:
                    os.unlink(file_path)
                    print(f"Removing file: {file_path}")
                except Exception as e:
                    print(f"Failed to remove {file_path}. Reason: {e}")
def generate_init(query_paths):
    """Regenerate models/__init__.py with one import line per model file.

    Renders the 'model_init.jinja2' template with the declaration lines
    and overwrites <migration-path>/models/__init__.py. Returns True on
    success; on failure returns the caught exception object (package
    convention — callers must check the type).
    """
    def get_declaration(query_paths):
        # Build one import declaration per query file path.
        model_names = []
        for filepath in query_paths:
            bn, filename, filetype = get_filename_from_path(filepath)
            # NOTE(review): "(unknown)" looks like mangled f-string
            # placeholders (presumably {filename}) lost when this source
            # was extracted — confirm against the original file; as
            # written every declaration is the same literal string.
            model_names.append(f'from .(unknown) import (unknown)')
        return model_names
    def render_init(declarations):
        # Render the init template; returns the text or the exception.
        try:
            env = Environment(loader=PackageLoader('dft', 'templates'))
            template = env.get_template('model_init.jinja2')
            # Render the template with variables
            rendered_content = template.render(
                model_names=declarations
            )
        except Exception as e:
            return e
        else:
            print("Init is rendered")
            return rendered_content
    lst_model = get_declaration(query_paths)
    content = render_init(lst_model)
    try:
        with open(f'{cfg["migration-path"]}/models/__init__.py', 'w') as f:
            f.write(content)
    except Exception as e:
        return e
    else:
        print(f"Init file is updated")
        return True
def get_filename_from_path(filepath):
    """Split *filepath* into (basename, stem, extension).

    e.g. "/a/b/model.sql" -> ("model.sql", "model", ".sql").
    Returns the caught exception object instead of raising on failure.
    """
    try:
        base = os.path.basename(filepath)
        stem, ext = os.path.splitext(base)
        return base, stem, ext
    except Exception as err:
        return err
def get_filepaths(directories):
    """Collect the full path of every file under each directory given.

    Walks each directory recursively (os.walk order) and returns a flat
    list of joined paths; an empty input yields an empty list.
    """
    return [
        os.path.join(root, name)
        for directory in directories
        for root, _dirs, names in os.walk(directory)
        for name in names
    ]
def process_query_file(filepath):
    """Read a query file and return (config, normalized_sql).

    *config* is parsed from a leading '{{ config = [ ... ] }}' block via
    _list_to_dict, or None when no such block exists. The returned SQL
    has every '{{ ... }}' block removed and all whitespace runs collapsed
    to single spaces.
    """
    with open(filepath, 'r') as handle:
        raw = handle.read()
    # Extract key-value pairs from within '{{ }}'
    header = re.search(r"{{\s*config\s*=\s*(\[[^\]]+\])\s*}}", raw)
    if header is None:
        config = None
    else:
        # Normalize whitespace, drop the surrounding brackets, then parse
        # each comma-separated 'key = "value"' entry.
        cleaned = re.sub(r'\s+', ' ', header.group(1)).strip()
        config = _list_to_dict(cleaned.strip('[]').split(','))
    stripped = re.sub(r"{{.*?}}", "", raw, flags=re.DOTALL)
    return config, re.sub(r'\s+', ' ', stripped).strip()

19
dft/base/loader.py Normal file
View File

@@ -0,0 +1,19 @@
from .config import config as cfg
import re
class SQLLoader():
    """Mixin providing initial-load helpers for SQL sources.

    Cooperative mixin: ``__init__`` forwards its arguments to the next
    class in the MRO, which is expected to store src_name/filepath.
    """

    def __init__(self, src_name, filepath) -> None:
        super().__init__(src_name, filepath)

    def _get_initial_load_query(self, query_name, query):
        """Extract the body of ``with <query_name> as (...)`` from *query*.

        Returns the stripped inner SQL, or None when no such CTE appears.
        Fixes vs. original: query_name is re.escape()d (it was spliced
        raw into the pattern, so names containing regex metacharacters
        misbehaved) and the pattern uses raw strings (the original's
        non-raw '\\(' is an invalid escape sequence warning on 3.12+).
        """
        match = re.search(
            r'with ' + re.escape(query_name) + r' as \((.*?)\)',
            query,
            re.DOTALL,
        )
        if match:
            return match.group(1).strip()
        return None

    def init_load(self):
        # NOTE(review): broken as written — `query_name` and `query` are
        # undefined names here (NameError when called) and
        # `_is_table_empty` is not defined on this class. Preserved
        # unchanged pending clarification of the intended behavior.
        self._is_table_empty()
        self._get_initial_load_query(query_name, query)

24
dft/base/migrator.py Normal file
View File

@@ -0,0 +1,24 @@
from prefect import task
from alembic.config import Config
from alembic import command
from .config import config as cfg
import yaml
import os
def get_target_url():
    """Build the mssql+pyodbc SQLAlchemy URL for the configured load target.

    Reads ~/.dft/sources.yml, resolves the active profile's target and its
    "load_to" entry named by cfg["target-name"]. Spaces in the ODBC driver
    name are replaced with '+' for URL encoding.
    """
    sources_file = os.path.expanduser("~") + "/.dft/sources.yml"
    with open(sources_file, 'r') as file:
        data = yaml.safe_load(file)
    profile = data[cfg["profile"]]
    conn_info = profile[profile['target']]["load_to"][cfg["target-name"]]
    connection_url = f'mssql+pyodbc://{conn_info["user"]}:{conn_info["password"]}@{conn_info["host"]}:{conn_info["port"]}/{conn_info["database"]}?driver={conn_info["driver"].replace(" ", "+")}'
    return connection_url
def create_migration_file(script_message):
    """Autogenerate a new Alembic revision against the target database.

    Loads alembic.ini from the project migration path, points it at the
    target URL, then runs `alembic revision --autogenerate`.
    """
    ini_path = f'{cfg["migration-path"]}/alembic.ini'
    alembic_config = Config(ini_path)
    alembic_config.set_main_option("sqlalchemy.url", get_target_url())
    command.revision(alembic_config, autogenerate=True, message=script_message)
def migrate():
    """Upgrade the target database to the latest Alembic revision ("head")."""
    ini_path = f'{cfg["migration-path"]}/alembic.ini'
    alembic_config = Config(ini_path)
    alembic_config.set_main_option("sqlalchemy.url", get_target_url())
    command.upgrade(alembic_config, "head")

9
dft/base/mixin.py Normal file
View File

@@ -0,0 +1,9 @@
from sqlalchemy import Column, Integer
from sqlalchemy.orm import sessionmaker
import pandas as pd
class DFTMixin():
    """Columns shared by every generated SQLAlchemy model."""
    #TODO: add repeatable field for slow dim etc.
    # Surrogate integer primary key ("Pk") for the generated table.
    Pk = Column("Pk", Integer, primary_key=True)
    # Identifier of the load batch that produced the row ("BatchID", nullable).
    batch_id = Column("BatchID", Integer, nullable=True)

91
dft/base/modeler.py Normal file
View File

@@ -0,0 +1,91 @@
from . import files
from jinja2 import Environment, PackageLoader
from sqlalchemy import text
from datetime import datetime, date, time
from decimal import Decimal
from .config import config as cfg
from . import files
class Modeler():
    """Base class that infers SQLAlchemy column types from sample values
    and renders model files from the package's Jinja2 templates.

    Attributes set by __init__: name (source name), filepath, plus
    bn/filename/filetype derived via files.get_filename_from_path.
    """

    def __init__(self, src_name, filepath):
        self.name = src_name
        self.filepath = filepath
        self.bn, self.filename, self.filetype = files.get_filename_from_path(self.filepath)

    def infer_type_from_value(self, value):
        """Map a Python value to the name of a SQLAlchemy column type.

        Bug fix: bool is tested BEFORE int — bool is a subclass of int,
        so the original order classified True/False as 'Integer' and the
        Boolean branch was unreachable.
        """
        if isinstance(value, bool):
            return 'Boolean'  # Maps to BOOLEAN in SQL
        elif isinstance(value, int):
            return 'Integer'  # Maps to INTEGER in SQL
        elif isinstance(value, str):
            return 'String'  # Maps to VARCHAR in SQL
        elif isinstance(value, float):
            return 'Float'  # Maps to FLOAT in SQL
        elif isinstance(value, Decimal):
            # Fixed precision/scale for decimals.
            return 'Numeric(12, 3)'  # Maps to NUMERIC in SQL
        elif isinstance(value, datetime):
            # Checked before date: datetime is a subclass of date.
            return 'DateTime'  # Maps to DATETIME in SQL
        elif isinstance(value, date):
            return 'Date'  # Maps to DATE in SQL
        elif isinstance(value, time):
            return 'Time'  # Maps to TIME in SQL
        elif isinstance(value, bytes):
            return 'LargeBinary'  # Maps to BLOB or BYTEA in SQL
        else:
            return 'String'  # Default fall-back type

    def render_model(self, columns):
        """Render the model-declaration template for this source.

        Returns the rendered text on success, or the caught exception
        object on failure (package convention).
        """
        # Timestamp doubles as the Alembic-style revision id.
        version = datetime.now().strftime("%Y%m%d%H%M%S")
        try:
            env = Environment(loader=PackageLoader('dft', 'templates'))
            template = env.get_template('model_declaration.jinja2')
            rendered_content = template.render(
                from_file=f"{(version)}_{str(self.filepath)}",
                migration_path=cfg["migration-path"],
                model_name=self.filename,
                revision_id=version,
                create_date=datetime.now().isoformat(),
                columns=columns
            )
        except Exception as e:
            return e
        else:
            print(f"Model {self.filename} content is rendered")
            return rendered_content

    def write_model_to_file(self, content):
        """Write rendered model *content* to <migration-path>/models/<name>.py.

        Returns True on success, or the caught exception on failure.
        """
        to_file = cfg["migration-path"] + f"/models/{self.filename}.py"
        try:
            # Write the rendered content to a file
            with open(to_file, 'w') as f:
                f.write(content)
        except Exception as e:
            return e
        else:
            print(f"Model {self.filename} has been written to file {to_file} with success !")
            return True
class SQLModeler(Modeler):
    """Modeler that infers model columns by sampling a query's first row."""

    def __init__(self, src_name, filepath):
        super().__init__(src_name, filepath)

    def infer_model_columns(self, result):
        """Build 'name = Column("name", Type)' lines from a result's first row.

        Bug fix: an empty result made fetchone() return None and the
        original crashed in zip(..., None) with TypeError; an empty
        result now yields an empty column list.
        """
        first_row = result.fetchone()
        if first_row is None:
            return []
        columns = []
        for column, value in zip(result.keys(), first_row):
            sql_type = self.infer_type_from_value(value)
            columns.append(f'{column} = Column("{column}", {sql_type})')
        return columns

    def generate_model(self, query):
        """Execute *query*, infer its columns and write the model file.

        NOTE: relies on _create_engine() being supplied by a sibling mixin
        (SQLConnector) in the final MRO — it is not defined on this class.
        """
        with self._create_engine().connect() as cnt:
            rst = cnt.execute(text(query))
            columns = self.infer_model_columns(rst)
            content = self.render_model(columns)
            self.write_model_to_file(content)

30
dft/base/sources.py Normal file
View File

@@ -0,0 +1,30 @@
import yaml
import os
from .connector import SQLConnector
from .modeler import SQLModeler
from .loader import SQLLoader
from .config import config as cfg
import pandas as pd
def get_source_type(cnt_name, action="extract_from"):
    """Look up the declared type of connection *cnt_name* in ~/.dft/sources.yml.

    *action* selects the section ("extract_from" by default) under the
    active profile's target.
    """
    sources_file = os.path.expanduser("~") + "/.dft/sources.yml"
    with open(sources_file, 'r') as file:
        data = yaml.safe_load(file)
    profile = data[cfg["profile"]]
    return profile[profile['target']][action][cnt_name]["type"]
def get_source(src_name, filepath, action="extract_from"):
    """Factory: return the Source object matching *src_name*'s declared type.

    Only "sql" sources are supported today; any other type yields an
    error *string* rather than an exception — callers must check.
    """
    #TODO: SelectConnector depending on other type
    if get_source_type(src_name, action) != "sql":
        return "Source type not supported for now."
    # SQLSource
    return SQLSource(src_name, filepath)
class SQLSource(SQLModeler, SQLConnector, SQLLoader):
type = "sql"
def __init__(self, src_name, filepath) -> None:
super().__init__(src_name, filepath)