skit_pipelines.components package

Submodules

skit_pipelines.components.asr_transcription module

audio_transcription(audios_dir_path: InputPath, config_path: InputPath, output_path: OutputPath, concurrency: int) → None[source]
audio_transcription_op(audios_dir: str, config: str, concurrency: int)

Audio transcription
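
The op factory is meant to be wired inside a Kubeflow pipeline. A minimal sketch, not from the source — the pipeline name, S3 paths, git host value, and the "output" handle names are assumptions:

   from kfp import dsl

   from skit_pipelines.components import (
       audio_transcription_op,
       download_directory_from_s3_op,
       download_yaml_op,
   )

   @dsl.pipeline(name="audio-transcription-demo")
   def transcription_pipeline(
       audios_s3_path: str,
       transcription_config_yaml: str,
       concurrency: int = 8,
   ):
       # Stage the inputs: audio directory from S3, ASR config from git.
       audios = download_directory_from_s3_op(storage_path=audios_s3_path)
       config = download_yaml_op(git_host_name="gitlab", yaml_path=transcription_config_yaml)
       # Transcribe with a bounded number of concurrent ASR requests.
       audio_transcription_op(
           audios_dir=audios.outputs["output"],
           config=config.outputs["output"],
           concurrency=concurrency,
       )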

skit_pipelines.components.asr_tune module

asr_tune(corpus_path: InputPath, val_corpus_path: InputPath, augment_wordlist_path: InputPath, remove_wordlist_path: InputPath, base_model_path: InputPath, general_lm_path: InputPath, output_path: OutputPath, lang: str) → None[source]
asr_tune_op(corpus: str, val_corpus: str, augment_wordlist: str, remove_wordlist: str, base_model: str, general_lm: str, lang: str)

Asr tune

skit_pipelines.components.audio_download module

download_audio_wavs(audio_data_path: InputPath, audio_sample_rate: str, audio_download_workers: int, output_path: OutputPath) → None[source]
download_audio_wavs_op(audio_data: str, audio_sample_rate: str, audio_download_workers: int)

Download audio wavs

skit_pipelines.components.auth module

org_auth_token(org_id: str, url: Optional[str] = None) → str[source]
org_auth_token_op(org_id: str, url: str = None)

Org auth token
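
Since org_auth_token is a plain function returning a token string, it can also be exercised outside Kubeflow. A hedged sketch — the org id and the downstream header usage are illustrative assumptions:

   from skit_pipelines.components.auth import org_auth_token

   # Fetch a token for an organisation; url=None falls back to the
   # component's default endpoint (assumption).
   token = org_auth_token(org_id="42")

   # Typical downstream use (assumption): authenticating API calls for that org.
   headers = {"Authorization": f"Bearer {token}"}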

skit_pipelines.components.create_mr module

create_mr(git_host_name: str, repo_name: str, project_path: str, target_branch: str, source_branch: str, mr_title: str, s3_description_paths: str) → str[source]
create_mr_op(git_host_name: str, repo_name: str, project_path: str, target_branch: str, source_branch: str, mr_title: str, s3_description_paths: str)

Create mr

skit_pipelines.components.download_from_s3 module

download_csv_from_s3(*, storage_path: str, empty_possible: bool = False, output_path: OutputPath) → None[source]
download_csv_from_s3_op(storage_path: str, empty_possible: bool = False)

Download csv from s3

download_directory_from_s3(*, storage_path: str, output_path: OutputPath) → None[source]
download_directory_from_s3_op(storage_path: str)

Download directory from s3

download_file_from_s3(*, storage_path: str, storage_options: str = '', empty_possible: bool = False, output_path: OutputPath) → None[source]
download_file_from_s3_op(storage_path: str, storage_options: str = '', empty_possible: bool = False)

Download file from s3
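
A minimal sketch of the three download flavours inside a pipeline. The names and S3 URIs are assumptions; by its name, empty_possible=True appears to let the step tolerate a missing or empty object:

   from kfp import dsl

   from skit_pipelines.components import (
       download_csv_from_s3_op,
       download_directory_from_s3_op,
       download_file_from_s3_op,
   )

   @dsl.pipeline(name="s3-download-demo")
   def download_pipeline(csv_uri: str, model_uri: str, audio_dir_uri: str):
       # Tabular artifact; tolerate an empty object (assumption about semantics).
       tagged_csv = download_csv_from_s3_op(storage_path=csv_uri, empty_possible=True)
       # Single binary artifact.
       base_model = download_file_from_s3_op(storage_path=model_uri)
       # A whole directory tree, e.g. audio files.
       audio_dir = download_directory_from_s3_op(storage_path=audio_dir_uri)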

skit_pipelines.components.download_repo module

download_repo(*, git_host_name: str, repo_name: str, project_path: str, repo_path: OutputPath) → None[source]
download_repo_op(git_host_name: str, repo_name: str, project_path: str)

Download repo

skit_pipelines.components.download_yaml module

download_yaml(git_host_name: str, yaml_path: str, output_path: OutputPath)[source]
download_yaml_op(git_host_name: str, yaml_path: str)

Download yaml

skit_pipelines.components.fetch_calls module

fetch_calls(*, lang: str, start_date: str, end_date: Optional[str] = None, client_id: Optional[str] = None, start_date_offset: int = 0, end_date_offset: int = 0, start_time_offset: int = 0, end_time_offset: int = 0, call_quantity: int = 200, call_type: Optional[str] = None, timezone: Optional[str] = None, ignore_callers: Optional[str] = None, reported: bool = False, template_id: Optional[str] = None, use_case: Optional[str] = None, flow_name: Optional[str] = None, min_duration: Optional[str] = None, asr_provider: Optional[str] = None, intents: Optional[str] = None, states: Optional[str] = None, calls_file_s3_path: Optional[str] = None, use_fsm_url: bool = False, remove_empty_audios: bool = True, flow_ids: Optional[str] = None) → str[source]
fetch_calls_op(lang: str, start_date: str, end_date: str = None, client_id: str = None, start_date_offset: int = 0, end_date_offset: int = 0, start_time_offset: int = 0, end_time_offset: int = 0, call_quantity: int = 200, call_type: str = None, timezone: str = None, ignore_callers: str = None, reported: bool = False, template_id: str = None, use_case: str = None, flow_name: str = None, min_duration: str = None, asr_provider: str = None, intents: str = None, states: str = None, calls_file_s3_path: str = None, use_fsm_url: bool = False, remove_empty_audios: bool = True, flow_ids: str = None)

Fetch calls
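
A sketch of a typical sampling call. Argument values are illustrative, and chaining into tag_calls_op (listed under Module contents below) is an assumption about intended use:

   from kfp import dsl

   from skit_pipelines.components import fetch_calls_op, tag_calls_op

   @dsl.pipeline(name="fetch-and-tag-calls")
   def fetch_and_tag(client_id: str, project_id: str):
       calls = fetch_calls_op(
           lang="en",
           start_date="2022-08-01",
           end_date="2022-08-07",
           client_id=client_id,
           call_quantity=500,
           remove_empty_audios=True,
       )
       # fetch_calls returns a str per its annotation; we assume it is a
       # file reference consumable by the tagging op.
       tag_calls_op(input_file=calls.output, project_id=project_id)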

skit_pipelines.components.fetch_tagged_dataset module

fetch_tagged_dataset(output_path: OutputPath, job_id: Optional[str] = None, project_id: Optional[str] = None, task_type: str = 'conversation', timezone: Optional[str] = None, start_date: Optional[str] = None, end_date: Optional[str] = None, start_date_offset: Optional[int] = None, end_date_offset: Optional[int] = None, empty_possible: bool = False)[source]
fetch_tagged_dataset_op(job_id: str = None, project_id: str = None, task_type: str = 'conversation', timezone: str = None, start_date: str = None, end_date: str = None, start_date_offset: int = None, end_date_offset: int = None, empty_possible: bool = False)

Fetch tagged dataset
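
A hedged sketch of pulling a tagged dataset by job id. Values are illustrative; that a tog job_id and a project_id are alternative ways of identifying the source is an assumption from the signature:

   from kfp import dsl

   from skit_pipelines.components import fetch_tagged_dataset_op

   @dsl.pipeline(name="fetch-tagged-dataset-demo")
   def fetch_tagged(job_id: str):
       fetch_tagged_dataset_op(
           job_id=job_id,
           task_type="conversation",
           timezone="Asia/Kolkata",   # matches the default used elsewhere in this package
           empty_possible=True,       # tolerate an empty tagging job (assumption)
       )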

skit_pipelines.components.file_contents_to_markdown_s3 module

file_contents_to_markdown_s3(ext: str, path_on_disk: InputPath, file_title: str = '') → str[source]
file_contents_to_markdown_s3_op(ext: str, path_on_disk: str, file_title: str = '')

File contents to markdown s3

skit_pipelines.components.gen_asr_metrics module

gen_asr_metrics(data_path: InputPath, output_path: OutputPath, true_label_column: str = 'transcript_y', pred_label_column: str = 'utterances')[source]
gen_asr_metrics_op(data: str, true_label_column: str = 'transcript_y', pred_label_column: str = 'utterances')

Gen asr metrics
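
The defaults suggest scoring ASR hypotheses (utterances) against tagged transcripts (transcript_y). A sketch chaining the transcription overlay into metrics — the pipeline shape and the "output" handle names are assumptions:

   from kfp import dsl

   from skit_pipelines.components import (
       download_file_from_s3_op,
       gen_asr_metrics_op,
       overlay_transcription_csv_op,
   )

   @dsl.pipeline(name="asr-metrics-demo")
   def asr_metrics(sqlite_uri: str, calls_csv_uri: str):
       sqlite = download_file_from_s3_op(storage_path=sqlite_uri)
       calls_csv = download_file_from_s3_op(storage_path=calls_csv_uri)
       # Merge fresh transcriptions back onto the original call CSV ...
       merged = overlay_transcription_csv_op(
           sqlite=sqlite.outputs["output"],
           original_csv=calls_csv.outputs["output"],
       )
       # ... then score predicted utterances against the tagged transcripts.
       gen_asr_metrics_op(
           data=merged.outputs["output"],
           true_label_column="transcript_y",
           pred_label_column="utterances",
       )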

skit_pipelines.components.merge_transcription module

overlay_transcription_csv(sqlite_path: InputPath, original_csv_path: InputPath, output_path: OutputPath) → None[source]
overlay_transcription_csv_op(sqlite: str, original_csv: str)

Overlay transcription csv

skit_pipelines.components.modify_tagged_entities module

modify_entity_dataset(data_path: InputPath, output_path: OutputPath, tog_job_id: Optional[str] = None, labelstudio_project_id: Optional[str] = None, timezone: str = 'Asia/Kolkata')[source]

Takes an entity dataset and: 1) hits the Duckling service for inference on the ground truth, 2) modifies the predicted entity structure to be consistent.

modify_entity_dataset_op(data: str, tog_job_id: str = None, labelstudio_project_id: str = None, timezone: str = 'Asia/Kolkata')

Modify entity dataset. Takes an entity dataset and: 1) hits the Duckling service for inference on the ground truth, 2) modifies the predicted entity structure to be consistent.

skit_pipelines.components.notification module

slack_notification(message: str, code_block: str = '', channel: str = '', cc: str = '', thread_id: str = '', file_title: str = '', file_content: str = '') None[source]

Send a message on any channel.

slack_notification_op(message: str, code_block: str = '', channel: str = '', cc: str = '', thread_id: str = '', file_title: str = '', file_content: str = '')

Slack notification. Sends a message on any channel.
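
A sketch of a notification step at the end of a run; the channel, user id, and message are illustrative:

   from kfp import dsl

   from skit_pipelines.components import slack_notification_op

   @dsl.pipeline(name="notify-demo")
   def notify(run_summary: str):
       slack_notification_op(
           message="Training pipeline finished.",
           channel="#ml-pipelines",   # illustrative channel
           cc="U12345678",            # assumed: a Slack user id to tag
           code_block=run_summary,    # assumed: rendered as a code block in the message
       )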

skit_pipelines.components.push_compliance_report_to_postgres module

push_compliance_report_to_postgres(s3_file_path: str) → int[source]
push_compliance_report_to_postgres_op(s3_file_path: str)

Push compliance report to postgres
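
Paired with identify_compliance_breaches_llm_op from the Module contents below, this suggests a two-step compliance flow. A hedged sketch, assuming the identify op returns the S3 path of its report:

   from kfp import dsl

   from skit_pipelines.components import (
       identify_compliance_breaches_llm_op,
       push_compliance_report_to_postgres_op,
   )

   @dsl.pipeline(name="compliance-report-demo")
   def compliance(calls_s3_path: str):
       report = identify_compliance_breaches_llm_op(s3_file_path=calls_s3_path)
       # Assumption: the identify op emits the S3 path of the generated report.
       push_compliance_report_to_postgres_op(s3_file_path=report.output)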

skit_pipelines.components.re_presign_s3_urls module

re_presign_s3_urls(audio_data_path: InputPath, output_path: OutputPath) → None[source]
re_presign_s3_urls_op(audio_data: str)

Re presign s3 urls

skit_pipelines.components.read_json_key module

read_json_key(req_value: str, input_file: InputPath) → Any[source]
read_json_key_op(req_value: str, input: str)

Read json key

skit_pipelines.components.retrain_slu_from_repo_old module

retrain_slu_from_repo(*, s3_data_path: InputPath, annotated_job_data_path: InputPath, slu_path: InputPath, intent_alias_path: InputPath, bucket: str, repo_name: str, branch: str, remove_intents: str = '', use_previous_dataset: bool = True, train_split_percent: int = 85, stratify: bool = False, epochs: int = 10, initial_training: bool = False, job_ids: str = '', labelstudio_project_ids: str = '', s3_paths: str = '', validate_setup: bool = False, output_classification_report_path: OutputPath, output_confusion_matrix_path: OutputPath, customization_repo_name: str = '', customization_repo_branch: str = '') → str[source]
retrain_slu_from_repo_op_old(s3_data: str, annotated_job_data: str, slu: str, intent_alias: str, bucket: str, repo_name: str, branch: str, remove_intents: str = '', use_previous_dataset: bool = True, train_split_percent: int = 85, stratify: bool = False, epochs: int = 10, initial_training: bool = False, job_ids: str = '', labelstudio_project_ids: str = '', s3_paths: str = '', validate_setup: bool = False, customization_repo_name: str = '', customization_repo_branch: str = '')

Retrain slu from repo
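
A minimal invocation sketch of this (old) retraining op with most knobs at their defaults. The bucket, repo, and branch values are illustrative, and feeding every artifact input from S3 download ops is an assumption about intended wiring:

   from kfp import dsl

   from skit_pipelines.components import (
       download_csv_from_s3_op,
       download_file_from_s3_op,
       retrain_slu_from_repo_op_old,
   )

   @dsl.pipeline(name="retrain-slu-demo")
   def retrain_slu(s3_data_uri: str, annotated_uri: str, slu_uri: str, alias_yaml_uri: str):
       data = download_csv_from_s3_op(storage_path=s3_data_uri, empty_possible=True)
       annotated = download_csv_from_s3_op(storage_path=annotated_uri, empty_possible=True)
       slu = download_file_from_s3_op(storage_path=slu_uri)
       alias = download_file_from_s3_op(storage_path=alias_yaml_uri)
       retrain_slu_from_repo_op_old(
           s3_data=data.outputs["output"],
           annotated_job_data=annotated.outputs["output"],
           slu=slu.outputs["output"],
           intent_alias=alias.outputs["output"],
           bucket="ml-artifacts",      # illustrative bucket name
           repo_name="my-slu",         # illustrative SLU repo
           branch="retrain-2022-08",   # illustrative working branch
           use_previous_dataset=True,
           train_split_percent=85,
           epochs=10,
       )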

skit_pipelines.components.upload2s3 module

upload2s3(path_on_disk: InputPath, reference: str = '', file_type: str = '', bucket: str = '', ext: str = '.csv', output_path: str = '', storage_options: str = '', upload_as_directory: bool = False) → str[source]
upload2s3_op(path_on_disk: str, reference: str = '', file_type: str = '', bucket: str = '', ext: str = '.csv', output_path: str = '', storage_options: str = '', upload_as_directory: bool = False)

Upload2s3
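
A sketch of publishing a pipeline artifact back to S3. The reference value and bucket are illustrative; path_on_disk would normally be fed from an upstream op's output, as here:

   from kfp import dsl

   from skit_pipelines.components import download_csv_from_s3_op, upload2s3_op

   @dsl.pipeline(name="upload-demo")
   def re_upload(csv_uri: str, bucket: str):
       # Pull an artifact, then publish it under a managed key in the bucket.
       dataset = download_csv_from_s3_op(storage_path=csv_uri)
       upload2s3_op(
           path_on_disk=dataset.outputs["output"],
           reference="tagged-dataset",   # assumption: used to build the object key
           file_type="csv",
           bucket=bucket,
           ext=".csv",
       )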

skit_pipelines.components.utils module

alias_dataset(dataset_path: str, alias_yaml_path: str, intent_col: str = 'tag') → None[source]
create_dataset_path(data_type, dataset_type)
evaluate(test_dataset_path, project_config_local_path, core_slu_repo_name, repo_name)[source]

To evaluate a model on a test set.

execute_cli(cmd, split=True)
filter_dataset(dataset_path: str, remove_intents_list: List[str], intent_col: str = 'tag') → None[source]
pick_1st_tag(tag: str)[source]
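
These helpers also work on local CSVs outside a pipeline. A hedged illustration — the file names and intent labels are made up, the 'tag' column follows the defaults above, and in-place modification is an assumption from the None return type:

   from skit_pipelines.components.utils import alias_dataset, filter_dataset

   # Drop rows whose tag is one of the unwanted intents (column name per the default).
   filter_dataset("train.csv", remove_intents_list=["_oos_", "audio_noisy"])

   # Rewrite intent names according to a YAML alias map (assumed to modify
   # the dataset in place, mapping variant intents to canonical ones).
   alias_dataset("train.csv", alias_yaml_path="intent_aliases.yaml")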

skit_pipelines.components.utils_slu module

data_handler(annotated_job_data_path, s3_data_path)[source]
handle_dvc_and_data_paths(repo, project_config_local_path, bucket, repo_name, initial_training, core_slu_repo_name, use_previous_dataset)[source]
prepare_data(tagged_data_path, core_slu_repo_name, project_config_local_path, repo_name, custom_test_dataset_present, use_previous_dataset, train_split_percent, stratify)[source]

Prepare training and testing datasets.

process_custom_test_dataset(custom_test_s3_data_path)[source]
setup_project_config_repo(repo_name, branch)[source]

Setup project config repo.

setup_repo(repo_name, repo_branch, run_dir=None, run_cmd=None, runtime_env_var=None)[source]

Download a SLU repo and install all necessary dependencies (using conda) as found in its dockerfile.

testing(repo_name, project_config_local_path, final_test_dataset_path, remove_intents, intent_alias_path, core_slu_repo_name, comparison_classification_report_path, comparison_confusion_matrix_path, compare_branch='master')[source]

skit_pipelines.components.zip_files_and_notify module

zip_file_and_notify(path_on_disk: InputPath, message: str, channel: str = '', thread_id: str = '', file_title: str = '', file_name: str = '', notify: str = '', display_sample: bool = False)[source]

Zip a file or folder and upload it on Slack.

:param message: the Slack message to be sent
:param channel: the channel in which the message is to be sent
:param thread_id: the thread to which the message must be added
:param file_title: title for the file
:param file_name: name of the file
:param notify: whether to send a Slack notification
:param display_sample: set it to true to display the value in the file

zip_file_and_notify_op(path_on_disk: str, message: str, channel: str = '', thread_id: str = '', file_title: str = '', file_name: str = '', notify: str = '', display_sample: bool = False)

Zip file and notify. Zips a file or folder and uploads it on Slack.
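
A sketch of shipping an artifact directory to Slack at the end of a run. The channel and file names are illustrative, and treating a non-empty notify value as "enabled" is an assumption from its str type:

   from kfp import dsl

   from skit_pipelines.components import (
       download_directory_from_s3_op,
       zip_file_and_notify_op,
   )

   @dsl.pipeline(name="zip-and-notify-demo")
   def ship_artifacts(artifact_dir_uri: str):
       artifacts = download_directory_from_s3_op(storage_path=artifact_dir_uri)
       zip_file_and_notify_op(
           path_on_disk=artifacts.outputs["output"],
           message="Model artifacts for review",
           channel="#ml-pipelines",   # illustrative
           file_title="artifacts",
           file_name="artifacts.zip",
           notify="yes",              # assumption: non-empty value enables notification
           display_sample=False,
       )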

Module contents

asr_tune_op(corpus: str, val_corpus: str, augment_wordlist: str, remove_wordlist: str, base_model: str, general_lm: str, lang: str)

Asr tune

audio_transcription_op(audios_dir: str, config: str, concurrency: int)

Audio transcription

create_mr_op(git_host_name: str, repo_name: str, project_path: str, target_branch: str, source_branch: str, mr_title: str, s3_description_paths: str)

Create mr

create_true_transcript_labels_op(data: str, true_label_column: str)

Create true transcript labels

create_utterances_op(data: str)

Create utterances

download_audio_wavs_op(audio_data: str, audio_sample_rate: str, audio_download_workers: int)

Download audio wavs

download_csv_from_s3_op(storage_path: str, empty_possible: bool = False)

Download csv from s3

download_directory_from_s3_op(storage_path: str)

Download directory from s3

download_file_from_s3_op(storage_path: str, storage_options: str = '', empty_possible: bool = False)

Download file from s3

download_repo_op(git_host_name: str, repo_name: str, project_path: str)

Download repo

download_yaml_op(git_host_name: str, yaml_path: str)

Download yaml

evalution_slu_from_repo_op(s3_data: str, annotated_job_data: str, intent_alias: str, bucket: str, repo_name: str, compare_branch: str, branch: str, remove_intents: str = '', validate_setup: bool = False, customization_repo_name: str = '', customization_repo_branch: str = '', core_slu_repo_name: str = '', core_slu_repo_branch: str = '')

Evaluate slu from repo

extract_true_transcript_labels_to_txt_op(data: str, true_label_column: str)

Extract true transcript labels to txt

fetch_calls_for_slots_op(untagged_records_path: str, org_id: str = '', language_code='', start_date='', end_date='')

Fetch calls for slots

fetch_calls_op(lang: str, start_date: str, end_date: str = None, client_id: str = None, start_date_offset: int = 0, end_date_offset: int = 0, start_time_offset: int = 0, end_time_offset: int = 0, call_quantity: int = 200, call_type: str = None, timezone: str = None, ignore_callers: str = None, reported: bool = False, template_id: str = None, use_case: str = None, flow_name: str = None, min_duration: str = None, asr_provider: str = None, intents: str = None, states: str = None, calls_file_s3_path: str = None, use_fsm_url: bool = False, remove_empty_audios: bool = True, flow_ids: str = None)

Fetch calls

fetch_gpt_intent_prediction_op(s3_file_path: str, use_assisted_annotation: bool)

Fetch gpt intent prediction

fetch_tagged_data_label_store_op(start_date: str, flow_id: str, end_date: str = None, limit: int = 200, data_labels: str = '')

Fetch tagged data label store

fetch_tagged_dataset_op(job_id: str = None, project_id: str = None, task_type: str = 'conversation', timezone: str = None, start_date: str = None, end_date: str = None, start_date_offset: int = None, end_date_offset: int = None, empty_possible: bool = False)

Fetch tagged dataset

file_contents_to_markdown_s3_op(ext: str, path_on_disk: str, file_title: str = '')

File contents to markdown s3

final_conversation_generator_op(situation_info_list: List[Dict[str, str]], s3_links_to_prompts: str, n_iter: int, n_choice: int, temperature: float, model: str, llm_trainer_repo_name: str, llm_trainer_repo_branch: str)

Final conversation generator

gen_asr_metrics_op(data: str, true_label_column: str = 'transcript_y', pred_label_column: str = 'utterances')

Gen asr metrics

identify_compliance_breaches_llm_op(s3_file_path: str)

Identify compliance breaches llm. Groups turns into calls and pushes them to an LLM (using the OpenAI chat-completion functionality) to identify compliance breaches.

invalidate_situations_in_db_op(situation_id)

Invalidate situations in db. Checks if the situation exists in the db; if it exists, returns the id, else inserts the situation into the db and returns the id.

modify_entity_dataset_op(data: str, tog_job_id: str = None, labelstudio_project_id: str = None, timezone: str = 'Asia/Kolkata')

Modify entity dataset. Takes an entity dataset and: 1) hits the Duckling service for inference on the ground truth, 2) modifies the predicted entity structure to be consistent.

org_auth_token_op(org_id: str, url: str = None)

Org auth token

overlay_transcription_csv_op(sqlite: str, original_csv: str)

Overlay transcription csv

process_true_transcript_labels_op(data: str, true_label_column: str)

Process true transcript labels

push_compliance_report_to_postgres_op(s3_file_path: str)

Push compliance report to postgres

re_presign_s3_urls_op(audio_data: str)

Re presign s3 urls

read_json_key_op(req_value: str, input: str)

Read json key

retrain_slu_from_repo_op(s3_data: str, custom_test_s3_data: str, annotated_job_data: str, intent_alias: str, bucket: str, repo_name: str, branch: str, remove_intents: str = '', use_previous_dataset: bool = True, train_split_percent: int = 85, stratify: bool = False, epochs: int = 10, initial_training: bool = False, labelstudio_project_ids: str = '', s3_paths: str = '', validate_setup: bool = False, customization_repo_name: str = '', customization_repo_branch: str = '', core_slu_repo_name: str = '', core_slu_repo_branch: str = '')

Retrain slu from repo

retrain_slu_from_repo_op_old(s3_data: str, annotated_job_data: str, slu: str, intent_alias: str, bucket: str, repo_name: str, branch: str, remove_intents: str = '', use_previous_dataset: bool = True, train_split_percent: int = 85, stratify: bool = False, epochs: int = 10, initial_training: bool = False, job_ids: str = '', labelstudio_project_ids: str = '', s3_paths: str = '', validate_setup: bool = False, customization_repo_name: str = '', customization_repo_branch: str = '')

Retrain slu from repo

sample_conversations_generator_op(filename: str, prompt_file_path: str, n_iter: int, n_choice: int, temperature: float, model: str, llm_trainer_repo_name: str, llm_trainer_repo_branch: str, situation_file_path: str = '', situations: str = None)

Sample conversations generator

slack_notification_op(message: str, code_block: str = '', channel: str = '', cc: str = '', thread_id: str = '', file_title: str = '', file_content: str = '')

Slack notification. Sends a message on any channel.

tag_calls_op(input_file: str, data_label: str = '', project_id: str = None, call_project_id: str = None)

Tag calls

upload2s3_op(path_on_disk: str, reference: str = '', file_type: str = '', bucket: str = '', ext: str = '.csv', output_path: str = '', storage_options: str = '', upload_as_directory: bool = False)

Upload2s3

upload_conv_to_label_studio_op(project_id: str, conversations_dir: str, data_label: str, situations_id_info: List[Dict[str, str]])

Upload conv to label studio

upload_conversation_data_to_metrics_db_op(situations_id_info: List[Dict[str, str]], client_id: str, template_id: str, generated_conversations_s3_link: str, prompt_links_in_s3: str, conv_directory: str)

Upload conversation data to metrics db. Uploads the conversation data to the metrics DB.

validate_and_add_situations_to_db_op(situations: str, scenario: str, scenario_category: str)

Validate and add situations to db. Checks if the situation exists in the db; if it exists, returns the id, else inserts the situation into the db and returns the id.

zip_file_and_notify_op(path_on_disk: str, message: str, channel: str = '', thread_id: str = '', file_title: str = '', file_name: str = '', notify: str = '', display_sample: bool = False)

Zip file and notify. Zips a file or folder and uploads it on Slack.
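
Finally, any pipeline composed from these ops is submitted like an ordinary kfp pipeline. A hedged sketch; the Kubeflow host and the argument values are placeholders:

   import kfp
   from kfp import dsl

   from skit_pipelines.components import fetch_calls_op

   @dsl.pipeline(name="sample-calls")
   def sample_calls(client_id: str):
       fetch_calls_op(lang="en", start_date="2022-08-01", client_id=client_id)

   # The host below is a placeholder for your Kubeflow deployment.
   client = kfp.Client(host="https://kubeflow.example.com")
   client.create_run_from_pipeline_func(sample_calls, arguments={"client_id": "42"})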