Migration commit from heyvince.co to heyvince.ca

This commit is contained in:
vgoineau
2024-05-03 09:24:48 -04:00
commit 2c898da04f
24 changed files with 679 additions and 0 deletions

1
dft/base/__init__.py Normal file
View File

@@ -0,0 +1 @@
from . import mixin

17
dft/base/config.py Normal file
View File

@@ -0,0 +1,17 @@
import yaml
def get_config_from_yml(filepath="dft_project.yml"):
    """Parse the project configuration YAML at *filepath*.

    Returns the parsed data on success. On any failure the caught
    exception object itself is returned instead of being raised —
    callers are expected to check the result's type.
    """
    try:
        with open(filepath, 'r') as stream:
            parsed = yaml.safe_load(stream)
        return parsed
    except Exception as err:
        return err
def load_config():
    """Load dft_project.yml from the current working directory."""
    return get_config_from_yml()


# Module-level configuration shared by the rest of the package; note this
# runs at import time, so the YAML must be readable from the CWD (or the
# resulting exception object is stored here — see get_config_from_yml).
config = load_config()

50
dft/base/connector.py Normal file
View File

@@ -0,0 +1,50 @@
import yaml
import os
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from .config import config as cfg
import pandas as pd
class SQLConnector():
    """Mixin that builds SQLAlchemy engines for a named extract source.

    Designed to be combined (via MRO) with a class that stores
    ``self.name``; on its own this class sets no attributes. All methods
    follow the package convention of returning the caught exception
    object instead of raising.
    """

    def __init__(self, src_name, filepath):
        # Cooperative constructor: forwards to the next class in the MRO,
        # which is expected to actually store src_name/filepath.
        super().__init__(src_name, filepath)

    def _get_source_url(self):
        """Build the SQLAlchemy URL for source ``self.name``.

        Reads ~/.dft/sources.yml, resolves the active profile's target and
        its "extract_from" entry. Returns the URL, or the exception.
        """
        sources_path = os.path.expanduser("~") + "/.dft/sources.yml"
        try:
            with open(sources_path, 'r') as file:
                data = yaml.safe_load(file)
            profile = data[cfg["profile"]]
            src_info = profile[profile['target']]["extract_from"][self.name]
            return URL.create(
                src_info["connector"],
                username=src_info["user"],
                password=src_info["password"],
                host=src_info["host"],
                port=src_info["port"],
                database=src_info["database"],
                query={
                    "driver": src_info["driver"],
                },
            )
        except Exception as err:
            return err

    def _create_engine(self):
        """Create an engine from the source URL; returns the exception on failure."""
        try:
            return create_engine(self._get_source_url())
        except Exception as err:
            return err

    def read_query(self, query) -> pd.DataFrame:
        """Execute *query* against the source and return a DataFrame (or the exception)."""
        try:
            return pd.read_sql(query, self._create_engine())
        except Exception as err:
            return err

103
dft/base/files.py Normal file
View File

@@ -0,0 +1,103 @@
from prefect import task
import os
import re
from jinja2 import Environment, PackageLoader
from .config import config as cfg
def _list_to_dict(input_list):
result_dict = {}
for item in input_list:
if '=' in item:
# Split the item on the first '=' and strip spaces and double quotes
key, value = item.split('=', 1)
key = key.strip()
value = value.strip().strip('"')
result_dict[key] = value
return result_dict
def delete_file(directory, donot=()):
    """Recursively delete every file under *directory*.

    Args:
        directory: root directory to walk.
        donot: iterable of file basenames to keep. The default was a
            mutable list (``[]``) — a Python anti-pattern — replaced by
            an immutable empty tuple; callers passing a list still work.

    Directories themselves are left in place. Individual failures are
    reported and the walk continues (best-effort cleanup).
    """
    for root, dirs, files in os.walk(directory, topdown=False):
        for name in files:
            file_path = os.path.join(root, name)
            if name not in donot:
                try:
                    os.unlink(file_path)
                    print(f"Removing file: {file_path}")
                except Exception as e:
                    print(f"Failed to remove {file_path}. Reason: {e}")
def generate_init(query_paths):
    """Regenerate models/__init__.py with one import line per model file.

    Renders the 'model_init.jinja2' template with the declaration lines
    and overwrites <migration-path>/models/__init__.py. Returns True on
    success; on failure returns the caught exception object (package
    convention — callers must check the type).
    """
    def get_declaration(query_paths):
        # Build one import declaration per query file path.
        model_names = []
        for filepath in query_paths:
            bn, filename, filetype = get_filename_from_path(filepath)
            # NOTE(review): "(unknown)" looks like mangled f-string
            # placeholders (presumably {filename}) lost when this source
            # was extracted — confirm against the original file; as
            # written every declaration is the same literal string.
            model_names.append(f'from .(unknown) import (unknown)')
        return model_names
    def render_init(declarations):
        # Render the init template; returns the text or the exception.
        try:
            env = Environment(loader=PackageLoader('dft', 'templates'))
            template = env.get_template('model_init.jinja2')
            # Render the template with variables
            rendered_content = template.render(
                model_names=declarations
            )
        except Exception as e:
            return e
        else:
            print("Init is rendered")
            return rendered_content
    lst_model = get_declaration(query_paths)
    content = render_init(lst_model)
    try:
        with open(f'{cfg["migration-path"]}/models/__init__.py', 'w') as f:
            f.write(content)
    except Exception as e:
        return e
    else:
        print(f"Init file is updated")
        return True
def get_filename_from_path(filepath):
    """Split *filepath* into (basename, stem, extension).

    e.g. "/a/b/model.sql" -> ("model.sql", "model", ".sql").
    Returns the caught exception object instead of raising on failure.
    """
    try:
        base = os.path.basename(filepath)
        stem, ext = os.path.splitext(base)
        return base, stem, ext
    except Exception as err:
        return err
def get_filepaths(directories):
    """Collect the full path of every file under each directory given.

    Walks each directory recursively (os.walk order) and returns a flat
    list of joined paths; an empty input yields an empty list.
    """
    return [
        os.path.join(root, name)
        for directory in directories
        for root, _dirs, names in os.walk(directory)
        for name in names
    ]
def process_query_file(filepath):
    """Read a query file and return (config, normalized_sql).

    *config* is parsed from a leading '{{ config = [ ... ] }}' block via
    _list_to_dict, or None when no such block exists. The returned SQL
    has every '{{ ... }}' block removed and all whitespace runs collapsed
    to single spaces.
    """
    with open(filepath, 'r') as handle:
        raw = handle.read()
    # Extract key-value pairs from within '{{ }}'
    header = re.search(r"{{\s*config\s*=\s*(\[[^\]]+\])\s*}}", raw)
    if header is None:
        config = None
    else:
        # Normalize whitespace, drop the surrounding brackets, then parse
        # each comma-separated 'key = "value"' entry.
        cleaned = re.sub(r'\s+', ' ', header.group(1)).strip()
        config = _list_to_dict(cleaned.strip('[]').split(','))
    stripped = re.sub(r"{{.*?}}", "", raw, flags=re.DOTALL)
    return config, re.sub(r'\s+', ' ', stripped).strip()

19
dft/base/loader.py Normal file
View File

@@ -0,0 +1,19 @@
from .config import config as cfg
import re
class SQLLoader():
    """Mixin providing initial-load helpers for SQL sources.

    Cooperative mixin: ``__init__`` forwards its arguments to the next
    class in the MRO, which is expected to store src_name/filepath.
    """

    def __init__(self, src_name, filepath) -> None:
        super().__init__(src_name, filepath)

    def _get_initial_load_query(self, query_name, query):
        """Extract the body of ``with <query_name> as (...)`` from *query*.

        Returns the stripped inner SQL, or None when no such CTE appears.
        Fixes vs. original: query_name is re.escape()d (it was spliced
        raw into the pattern, so names containing regex metacharacters
        misbehaved) and the pattern uses raw strings (the original's
        non-raw '\\(' is an invalid escape sequence warning on 3.12+).
        """
        match = re.search(
            r'with ' + re.escape(query_name) + r' as \((.*?)\)',
            query,
            re.DOTALL,
        )
        if match:
            return match.group(1).strip()
        return None

    def init_load(self):
        # NOTE(review): broken as written — `query_name` and `query` are
        # undefined names here (NameError when called) and
        # `_is_table_empty` is not defined on this class. Preserved
        # unchanged pending clarification of the intended behavior.
        self._is_table_empty()
        self._get_initial_load_query(query_name, query)

24
dft/base/migrator.py Normal file
View File

@@ -0,0 +1,24 @@
from prefect import task
from alembic.config import Config
from alembic import command
from .config import config as cfg
import yaml
import os
def get_target_url():
    """Build the mssql+pyodbc SQLAlchemy URL for the configured load target.

    Reads ~/.dft/sources.yml, resolves the active profile's target and its
    "load_to" entry named by cfg["target-name"]. Spaces in the ODBC driver
    name are replaced with '+' for URL encoding.
    """
    sources_file = os.path.expanduser("~") + "/.dft/sources.yml"
    with open(sources_file, 'r') as file:
        data = yaml.safe_load(file)
    profile = data[cfg["profile"]]
    conn_info = profile[profile['target']]["load_to"][cfg["target-name"]]
    connection_url = f'mssql+pyodbc://{conn_info["user"]}:{conn_info["password"]}@{conn_info["host"]}:{conn_info["port"]}/{conn_info["database"]}?driver={conn_info["driver"].replace(" ", "+")}'
    return connection_url
def create_migration_file(script_message):
    """Autogenerate a new Alembic revision against the target database.

    Loads alembic.ini from the project migration path, points it at the
    target URL, then runs `alembic revision --autogenerate`.
    """
    ini_path = f'{cfg["migration-path"]}/alembic.ini'
    alembic_config = Config(ini_path)
    alembic_config.set_main_option("sqlalchemy.url", get_target_url())
    command.revision(alembic_config, autogenerate=True, message=script_message)
def migrate():
    """Upgrade the target database to the latest Alembic revision ("head")."""
    ini_path = f'{cfg["migration-path"]}/alembic.ini'
    alembic_config = Config(ini_path)
    alembic_config.set_main_option("sqlalchemy.url", get_target_url())
    command.upgrade(alembic_config, "head")

9
dft/base/mixin.py Normal file
View File

@@ -0,0 +1,9 @@
from sqlalchemy import Column, Integer
from sqlalchemy.orm import sessionmaker
import pandas as pd
class DFTMixin():
    """Columns shared by every generated SQLAlchemy model."""
    #TODO: add repeatable field for slow dim etc.
    # Surrogate integer primary key ("Pk") for the generated table.
    Pk = Column("Pk", Integer, primary_key=True)
    # Identifier of the load batch that produced the row ("BatchID", nullable).
    batch_id = Column("BatchID", Integer, nullable=True)

91
dft/base/modeler.py Normal file
View File

@@ -0,0 +1,91 @@
from . import files
from jinja2 import Environment, PackageLoader
from sqlalchemy import text
from datetime import datetime, date, time
from decimal import Decimal
from .config import config as cfg
from . import files
class Modeler():
    """Base class that infers SQLAlchemy column types from sample values
    and renders model files from the package's Jinja2 templates.

    Attributes set by __init__: name (source name), filepath, plus
    bn/filename/filetype derived via files.get_filename_from_path.
    """

    def __init__(self, src_name, filepath):
        self.name = src_name
        self.filepath = filepath
        self.bn, self.filename, self.filetype = files.get_filename_from_path(self.filepath)

    def infer_type_from_value(self, value):
        """Map a Python value to the name of a SQLAlchemy column type.

        Bug fix: bool is tested BEFORE int — bool is a subclass of int,
        so the original order classified True/False as 'Integer' and the
        Boolean branch was unreachable.
        """
        if isinstance(value, bool):
            return 'Boolean'  # Maps to BOOLEAN in SQL
        elif isinstance(value, int):
            return 'Integer'  # Maps to INTEGER in SQL
        elif isinstance(value, str):
            return 'String'  # Maps to VARCHAR in SQL
        elif isinstance(value, float):
            return 'Float'  # Maps to FLOAT in SQL
        elif isinstance(value, Decimal):
            # Fixed precision/scale for decimals.
            return 'Numeric(12, 3)'  # Maps to NUMERIC in SQL
        elif isinstance(value, datetime):
            # Checked before date: datetime is a subclass of date.
            return 'DateTime'  # Maps to DATETIME in SQL
        elif isinstance(value, date):
            return 'Date'  # Maps to DATE in SQL
        elif isinstance(value, time):
            return 'Time'  # Maps to TIME in SQL
        elif isinstance(value, bytes):
            return 'LargeBinary'  # Maps to BLOB or BYTEA in SQL
        else:
            return 'String'  # Default fall-back type

    def render_model(self, columns):
        """Render the model-declaration template for this source.

        Returns the rendered text on success, or the caught exception
        object on failure (package convention).
        """
        # Timestamp doubles as the Alembic-style revision id.
        version = datetime.now().strftime("%Y%m%d%H%M%S")
        try:
            env = Environment(loader=PackageLoader('dft', 'templates'))
            template = env.get_template('model_declaration.jinja2')
            rendered_content = template.render(
                from_file=f"{(version)}_{str(self.filepath)}",
                migration_path=cfg["migration-path"],
                model_name=self.filename,
                revision_id=version,
                create_date=datetime.now().isoformat(),
                columns=columns
            )
        except Exception as e:
            return e
        else:
            print(f"Model {self.filename} content is rendered")
            return rendered_content

    def write_model_to_file(self, content):
        """Write rendered model *content* to <migration-path>/models/<name>.py.

        Returns True on success, or the caught exception on failure.
        """
        to_file = cfg["migration-path"] + f"/models/{self.filename}.py"
        try:
            # Write the rendered content to a file
            with open(to_file, 'w') as f:
                f.write(content)
        except Exception as e:
            return e
        else:
            print(f"Model {self.filename} has been written to file {to_file} with success !")
            return True
class SQLModeler(Modeler):
    """Modeler that infers model columns by sampling a query's first row."""

    def __init__(self, src_name, filepath):
        super().__init__(src_name, filepath)

    def infer_model_columns(self, result):
        """Build 'name = Column("name", Type)' lines from a result's first row.

        Bug fix: an empty result made fetchone() return None and the
        original crashed in zip(..., None) with TypeError; an empty
        result now yields an empty column list.
        """
        first_row = result.fetchone()
        if first_row is None:
            return []
        columns = []
        for column, value in zip(result.keys(), first_row):
            sql_type = self.infer_type_from_value(value)
            columns.append(f'{column} = Column("{column}", {sql_type})')
        return columns

    def generate_model(self, query):
        """Execute *query*, infer its columns and write the model file.

        NOTE: relies on _create_engine() being supplied by a sibling mixin
        (SQLConnector) in the final MRO — it is not defined on this class.
        """
        with self._create_engine().connect() as cnt:
            rst = cnt.execute(text(query))
            columns = self.infer_model_columns(rst)
            content = self.render_model(columns)
            self.write_model_to_file(content)

30
dft/base/sources.py Normal file
View File

@@ -0,0 +1,30 @@
import yaml
import os
from .connector import SQLConnector
from .modeler import SQLModeler
from .loader import SQLLoader
from .config import config as cfg
import pandas as pd
def get_source_type(cnt_name, action="extract_from"):
    """Look up the declared type of connection *cnt_name* in ~/.dft/sources.yml.

    *action* selects the section ("extract_from" by default) under the
    active profile's target.
    """
    sources_file = os.path.expanduser("~") + "/.dft/sources.yml"
    with open(sources_file, 'r') as file:
        data = yaml.safe_load(file)
    profile = data[cfg["profile"]]
    return profile[profile['target']][action][cnt_name]["type"]
def get_source(src_name, filepath, action="extract_from"):
    """Factory: return the Source object matching *src_name*'s declared type.

    Only "sql" sources are supported today; any other type yields an
    error *string* rather than an exception — callers must check.
    """
    #TODO: SelectConnector depending on other type
    if get_source_type(src_name, action) != "sql":
        return "Source type not supported for now."
    # SQLSource
    return SQLSource(src_name, filepath)
class SQLSource(SQLModeler, SQLConnector, SQLLoader):
type = "sql"
def __init__(self, src_name, filepath) -> None:
super().__init__(src_name, filepath)