Migration commit from heyvince.co to heyvince.ca
This commit is contained in:
1
dft/base/__init__.py
Normal file
1
dft/base/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from . import mixin
|
||||
17
dft/base/config.py
Normal file
17
dft/base/config.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import yaml
|
||||
|
||||
|
||||
def get_config_from_yml(filepath="dft_project.yml"):
    """Parse the dft project configuration from a YAML file.

    Returns the parsed mapping on success. On any failure the caught
    exception object itself is returned (callers must type-check the result).
    """
    try:
        with open(filepath, 'r') as fh:
            parsed = yaml.safe_load(fh)
    except Exception as err:
        # NOTE(review): returning the exception instead of raising silently
        # propagates failures to callers — consider raising in a future change.
        return err
    return parsed
|
||||
|
||||
def load_config():
    """Return the project configuration parsed from the default dft_project.yml."""
    return get_config_from_yml()


# Loaded once at import time; sibling modules do `from .config import config`.
config = load_config()
|
||||
50
dft/base/connector.py
Normal file
50
dft/base/connector.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import yaml
|
||||
import os
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.engine import URL
|
||||
from .config import config as cfg
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class SQLConnector():
    """SQL-source mixin: builds a SQLAlchemy engine from ~/.dft/sources.yml
    and reads query results into pandas DataFrames.

    Designed to be combined with the modeler/loader mixins (see SQLSource);
    relies on `self.name` being set by a cooperating base class.
    """

    def __init__(self, src_name, filepath):
        # Cooperative-inheritance pass-through; only meaningful when this
        # class participates in SQLSource's MRO.
        super().__init__(src_name, filepath)

    def _get_source_url(self):
        """Assemble the SQLAlchemy URL for this source.

        Returns the URL on success, or the raised exception on failure.
        """
        try:
            with open(os.path.expanduser("~")+"/.dft/sources.yml", 'r') as fh:
                profiles = yaml.safe_load(fh)
                target = profiles[cfg["profile"]]['target']
                info = profiles[cfg["profile"]][target]["extract_from"][self.name]
                src_url = URL.create(
                    info["connector"],
                    username=info["user"],
                    password=info["password"],
                    host=info["host"],
                    port=info["port"],
                    database=info["database"],
                    query={"driver": info["driver"]},
                )
        except Exception as err:
            return err
        return src_url

    def _create_engine(self):
        """Create an engine for this source, or return the raised exception."""
        try:
            return create_engine(self._get_source_url())
        except Exception as err:
            return err

    def read_query(self, query) -> pd.DataFrame:
        """Run `query` against this source; return a DataFrame, or the raised exception."""
        try:
            return pd.read_sql(query, self._create_engine())
        except Exception as err:
            return err
|
||||
103
dft/base/files.py
Normal file
103
dft/base/files.py
Normal file
@@ -0,0 +1,103 @@
|
||||
from prefect import task
|
||||
import os
|
||||
import re
|
||||
from jinja2 import Environment, PackageLoader
|
||||
from .config import config as cfg
|
||||
|
||||
|
||||
def _list_to_dict(input_list):
|
||||
result_dict = {}
|
||||
for item in input_list:
|
||||
if '=' in item:
|
||||
# Split the item on the first '=' and strip spaces and double quotes
|
||||
key, value = item.split('=', 1)
|
||||
key = key.strip()
|
||||
value = value.strip().strip('"')
|
||||
result_dict[key] = value
|
||||
return result_dict
|
||||
|
||||
def delete_file(directory, donot=None):
    """Recursively delete every file under `directory`.

    Parameters
    ----------
    directory : str
        Root directory to walk.
    donot : collection of str, optional
        Basenames to keep. (Was a mutable default `[]` — replaced with the
        None-sentinel idiom so the default is never shared between calls.)

    Directories themselves are left in place; per-file failures are printed
    and skipped rather than raised.
    """
    keep = frozenset(donot or ())
    for root, dirs, files in os.walk(directory, topdown=False):
        for name in files:
            if name in keep:
                continue
            file_path = os.path.join(root, name)
            try:
                os.unlink(file_path)
                print(f"Removing file: {file_path}")
            except Exception as e:
                print(f"Failed to remove {file_path}. Reason: {e}")
|
||||
|
||||
def generate_init(query_paths):
    """Regenerate models/__init__.py so it re-exports one model per query file.

    Renders the `model_init.jinja2` template with one import line per query
    path and writes it to `<migration-path>/models/__init__.py`.

    Returns True on success, or the raised exception when rendering/writing fails.
    """

    def get_declaration(query_paths):
        # One `from .<model> import <model>` line per query file.
        model_names = []
        for filepath in query_paths:
            bn, filename, filetype = get_filename_from_path(filepath)
            # NOTE(review): reconstructed from a garbled source line — assumed
            # to re-export each model module under its own name; confirm
            # against model_init.jinja2.
            model_names.append(f'from .{filename} import {filename}')
        return model_names

    def render_init(declarations):
        # Render the __init__ template; returns content or the raised exception.
        try:
            env = Environment(loader=PackageLoader('dft', 'templates'))
            template = env.get_template('model_init.jinja2')
            rendered_content = template.render(
                model_names=declarations
            )
        except Exception as e:
            return e
        else:
            print("Init is rendered")
            return rendered_content

    lst_model = get_declaration(query_paths)
    content = render_init(lst_model)
    try:
        with open(f'{cfg["migration-path"]}/models/__init__.py', 'w') as f:
            f.write(content)
    except Exception as e:
        return e
    else:
        print(f"Init file is updated")
        return True
|
||||
|
||||
def get_filename_from_path(filepath):
    """Split a path into (basename, stem, extension).

    e.g. 'a/b.sql' -> ('b.sql', 'b', '.sql'). On failure the caught
    exception is returned instead of the tuple.
    """
    try:
        base = os.path.basename(filepath)
        stem, ext = os.path.splitext(base)
    except Exception as err:
        return err
    return base, stem, ext
|
||||
|
||||
def get_filepaths(directories):
    """Collect the full path of every file found under each directory in `directories`."""
    paths = []
    for directory in directories:
        # Walk each tree; join root with each filename to get a full path.
        for root, _dirs, filenames in os.walk(directory):
            paths.extend(os.path.join(root, fname) for fname in filenames)
    return paths
|
||||
|
||||
def process_query_file(filepath):
    """Read a query file and return (config, query).

    `config` is the dict parsed from a leading `{{ config = [...] }}` header
    (None when absent); `query` is the remaining text with every `{{ ... }}`
    block stripped and all whitespace collapsed to single spaces.
    """
    with open(filepath, 'r') as fh:
        raw = fh.read()

    # Look for a config header of the form {{ config = [ key = "value", ... ] }}
    header = re.search(r"{{\s*config\s*=\s*(\[[^\]]+\])\s*}}", raw)
    if header is None:
        config = None
    else:
        entries = re.sub(r'\s+', ' ', header.group(1)).strip().strip('[]')
        config = _list_to_dict(entries.split(','))

    # Remove all templating blocks, then normalise whitespace.
    query = re.sub(r"{{.*?}}", "", raw, flags=re.DOTALL)
    query = re.sub(r'\s+', ' ', query).strip()

    return config, query
|
||||
19
dft/base/loader.py
Normal file
19
dft/base/loader.py
Normal file
@@ -0,0 +1,19 @@
|
||||
from .config import config as cfg
|
||||
import re
|
||||
|
||||
class SQLLoader():
    """Loader mixin for SQL sources: extracts the initial-load CTE from a query.

    NOTE(review): holds no state of its own; relies on cooperating mixins
    (see SQLSource) via cooperative `super().__init__`.
    """

    def __init__(self, src_name, filepath) -> None:
        # Cooperative-inheritance pass-through (only valid inside SQLSource's MRO).
        super().__init__(src_name, filepath)

    def _get_initial_load_query(self, query_name, query):
        """Return the body of `with <query_name> as ( ... )` found in `query`, or None.

        NOTE(review): `query_name` is interpolated unescaped into the regex —
        a name containing regex metacharacters would misbehave; consider re.escape.
        """
        match = re.search(r'with ' + query_name + ' as \((.*?)\)', query, re.DOTALL)
        if match:
            initial_load_query = match.group(1).strip()
            return initial_load_query
        else:
            return None

    def init_load(self):
        """Perform the initial load for this source.

        TODO(review): `query_name` and `query` are undefined here (NameError at
        runtime) and `_is_table_empty` is not defined in this module — this
        method looks unfinished; confirm the intended arguments and owner.
        """
        self._is_table_empty()
        self._get_initial_load_query(query_name, query)
|
||||
24
dft/base/migrator.py
Normal file
24
dft/base/migrator.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from prefect import task
|
||||
from alembic.config import Config
|
||||
from alembic import command
|
||||
from .config import config as cfg
|
||||
import yaml
|
||||
import os
|
||||
|
||||
def get_target_url():
    """Build the mssql+pyodbc connection URL for the configured load target."""
    with open(os.path.expanduser("~")+"/.dft/sources.yml", 'r') as fh:
        data = yaml.safe_load(fh)
    target = data[cfg["profile"]]['target']
    conn_info = data[cfg["profile"]][target]["load_to"][cfg["target-name"]]
    # ODBC driver names contain spaces, which must be '+'-encoded in the URL.
    driver = conn_info["driver"].replace(" ", "+")
    return (
        f'mssql+pyodbc://{conn_info["user"]}:{conn_info["password"]}'
        f'@{conn_info["host"]}:{conn_info["port"]}/{conn_info["database"]}'
        f'?driver={driver}'
    )
|
||||
|
||||
def _alembic_config():
    """Alembic Config pointed at the project's migration path and target DB.

    Extracted because create_migration_file() and migrate() previously
    duplicated this construction line-for-line.
    """
    alembic_cfg = Config(f'{cfg["migration-path"]}/alembic.ini')
    alembic_cfg.set_main_option("sqlalchemy.url", get_target_url())
    return alembic_cfg


def create_migration_file(script_message):
    """Autogenerate a new Alembic revision with `script_message` as its message."""
    command.revision(_alembic_config(), autogenerate=True, message=script_message)


def migrate():
    """Upgrade the target database to the latest revision (head)."""
    command.upgrade(_alembic_config(), "head")
|
||||
9
dft/base/mixin.py
Normal file
9
dft/base/mixin.py
Normal file
@@ -0,0 +1,9 @@
|
||||
from sqlalchemy import Column, Integer
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class DFTMixin():
    """Columns shared by every dft-generated model."""
    #TODO: add repeatable field for slow dim etc.
    # Surrogate primary key for generated model tables.
    Pk = Column("Pk", Integer, primary_key=True)
    # NOTE(review): presumably tags each row with the load batch that wrote
    # it — confirm against the loader. Nullable, so existing rows need no value.
    batch_id = Column("BatchID", Integer, nullable=True)
|
||||
91
dft/base/modeler.py
Normal file
91
dft/base/modeler.py
Normal file
@@ -0,0 +1,91 @@
|
||||
from . import files
|
||||
from jinja2 import Environment, PackageLoader
|
||||
from sqlalchemy import text
|
||||
from datetime import datetime, date, time
|
||||
from decimal import Decimal
|
||||
from .config import config as cfg
|
||||
from . import files
|
||||
|
||||
class Modeler():
    """Base modeler: infers SQLAlchemy column types from sample values and
    renders/writes a model declaration file for a source query."""

    def __init__(self, src_name, filepath):
        self.name = src_name
        self.filepath = filepath
        self.bn, self.filename, self.filetype = files.get_filename_from_path(self.filepath)

    def infer_type_from_value(self, value):
        """Map a sample Python value to a SQLAlchemy type name (as a string).

        Fix: `bool` must be tested BEFORE `int` — bool is a subclass of int,
        so the original ordering made the Boolean branch unreachable and
        mapped booleans to 'Integer'.
        """
        if isinstance(value, bool):
            return 'Boolean'  # Maps to BOOLEAN in SQL
        elif isinstance(value, int):
            return 'Integer'  # Maps to INTEGER in SQL
        elif isinstance(value, str):
            return 'String'  # Maps to VARCHAR in SQL
        elif isinstance(value, float):
            return 'Float'  # Maps to FLOAT in SQL
        elif isinstance(value, Decimal):
            # Fixed precision/scale for decimal values
            return 'Numeric(12, 3)'  # Maps to NUMERIC in SQL
        elif isinstance(value, datetime):
            # datetime before date: datetime is a subclass of date
            return 'DateTime'  # Maps to DATETIME in SQL
        elif isinstance(value, date):
            return 'Date'  # Maps to DATE in SQL
        elif isinstance(value, time):
            return 'Time'  # Maps to TIME in SQL
        elif isinstance(value, bytes):
            return 'LargeBinary'  # Maps to BLOB or BYTEA in SQL
        else:
            return 'String'  # Default fall-back type

    def render_model(self, columns):
        """Render the model_declaration template for this source.

        Returns the rendered content, or the raised exception on failure.
        """
        version = datetime.now().strftime("%Y%m%d%H%M%S")

        try:
            env = Environment(loader=PackageLoader('dft', 'templates'))
            template = env.get_template('model_declaration.jinja2')

            rendered_content = template.render(
                from_file=f"{(version)}_{str(self.filepath)}",
                migration_path=cfg["migration-path"],
                model_name=self.filename,
                revision_id=version,
                create_date=datetime.now().isoformat(),
                columns=columns
            )
        except Exception as e:
            return e
        else:
            print(f"Model {self.filename} content is rendered")
            return rendered_content

    def write_model_to_file(self, content):
        """Write rendered model content to <migration-path>/models/<name>.py.

        Returns True on success, or the raised exception on failure.
        """
        to_file = cfg["migration-path"] + f"/models/{self.filename}.py"
        try:
            # Write the rendered content to a file
            with open(to_file, 'w') as f:
                f.write(content)
        except Exception as e:
            return e
        else:
            print(f"Model {self.filename} has been written to file {to_file} with success !")
            return True
|
||||
|
||||
|
||||
class SQLModeler(Modeler):
    """Modeler specialised for SQL sources: samples one row of a query result
    to infer column types, then renders and writes the model file."""

    def __init__(self, src_name, filepath):
        super().__init__(src_name, filepath)

    def infer_model_columns(self, result):
        """Build `name = Column("name", Type)` declarations from the first result row."""
        sample = result.fetchone()
        return [
            f'{name} = Column("{name}", {self.infer_type_from_value(cell)})'
            for name, cell in zip(result.keys(), sample)
        ]

    def generate_model(self, query):
        """Run `query`, infer its columns, and write the resulting model to file."""
        with self._create_engine().connect() as conn:
            outcome = conn.execute(text(query))
            declarations = self.infer_model_columns(outcome)
            rendered = self.render_model(declarations)
            self.write_model_to_file(rendered)
|
||||
30
dft/base/sources.py
Normal file
30
dft/base/sources.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import yaml
|
||||
import os
|
||||
from .connector import SQLConnector
|
||||
from .modeler import SQLModeler
|
||||
from .loader import SQLLoader
|
||||
from .config import config as cfg
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def get_source_type(cnt_name, action="extract_from"):
    """Look up the declared type of connection `cnt_name` in ~/.dft/sources.yml."""
    with open(os.path.expanduser("~")+"/.dft/sources.yml", 'r') as fh:
        data = yaml.safe_load(fh)
    profile = data[cfg["profile"]]
    return profile[profile['target']][action][cnt_name]["type"]
|
||||
|
||||
def get_source(src_name, filepath, action="extract_from"):
    """Instantiate the Source implementation matching the configured type.

    Returns a message string (not an exception) for unsupported types.
    """
    #TODO: SelectConnector depending on other type
    if get_source_type(src_name, action) == "sql":
        return SQLSource(src_name, filepath)
    return "Source type not supported for now."
|
||||
|
||||
|
||||
class SQLSource(SQLModeler, SQLConnector, SQLLoader):
    """Concrete SQL source combining the modeling, connecting and loading mixins.

    NOTE(review): in this MRO only Modeler.__init__ actually assigns state —
    verify the connector/loader mixins need no initialisation of their own.
    """
    # Discriminator matched by get_source() against the sources.yml "type" field.
    type = "sql"

    def __init__(self, src_name, filepath) -> None:
        super().__init__(src_name, filepath)
|
||||
Reference in New Issue
Block a user