I am working with two clouds: my task is to rsync files that arrive in an S3 bucket over to a GCS bucket. To achieve this I am using the GCP Cloud Composer (Airflow) service, where I schedule this rsync operation to sync the files. I am using an Airflow connection (aws_default) to store the AWS access key and secret access key. Everything works fine, except that I can see the credentials in the task logs, which exposes them again, and I don't want them displayed even in the logs. Is there any way to keep the credentials from showing up in the logs?
from datetime import datetime, timedelta

from airflow import DAG
from airflow.hooks.base_hook import BaseHook
from airflow.operators.bash_operator import BashOperator
START_TIME = datetime.utcnow() - timedelta(hours=1)
default_args = {
    'owner': 'airflow',
    'depends_on_past': True,
    'wait_for_downstream': True,
    'start_date': START_TIME,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=3)
}
aws_connection = BaseHook.get_connection('aws_default')
bash_env = {
    "AWS_ACCESS_KEY_ID": aws_connection.login,
    "AWS_SECRET_ACCESS_KEY": aws_connection.password
}
# The credentials are interpolated directly into the command string, so they
# appear in the rendered bash_command and in the task logs.
rsync_command = '''
set -e;
export AWS_ACCESS_KEY_ID="%s";
export AWS_SECRET_ACCESS_KEY="%s";
''' % (bash_env.get('AWS_ACCESS_KEY_ID'), bash_env.get('AWS_SECRET_ACCESS_KEY')) \
    + '''
gsutil -m rsync -r -n s3://aws_bucket/{{ execution_date.strftime('%Y/%m/%d/%H') }}/ gs://gcp_bucket/good/test/
'''
dag = DAG(
    'rsync',
    default_args=default_args,
    description='This dag is for gsutil rsync from s3 bucket to gcs storage',
    schedule_interval=timedelta(minutes=20),
    dagrun_timeout=timedelta(minutes=15)
)
s3_sync = BashOperator(
    task_id='gsutil_s3_gcp_sync',
    bash_command=rsync_command,
    dag=dag,
    depends_on_past=False,
    execution_timeout=timedelta(hours=1),
)
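
One direction I have looked at (not verified) is to stop interpolating the keys into the command string and instead hand them to the BashOperator through its env argument, so that the rendered bash_command no longer contains them. A minimal sketch, reusing dag, aws_connection and timedelta from the code above and assuming boto/gsutil will pick the keys up from AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY in the environment:

import os

# The command itself no longer contains any credentials.
rsync_command = '''
set -e;
gsutil -m rsync -r -n s3://aws_bucket/{{ execution_date.strftime('%Y/%m/%d/%H') }}/ gs://gcp_bucket/good/test/
'''

s3_sync = BashOperator(
    task_id='gsutil_s3_gcp_sync',
    bash_command=rsync_command,
    # BashOperator's env replaces the child process environment entirely,
    # so merge os.environ back in to keep PATH etc. available for gsutil.
    env={**os.environ,
         'AWS_ACCESS_KEY_ID': aws_connection.login,
         'AWS_SECRET_ACCESS_KEY': aws_connection.password},
    dag=dag,
    depends_on_past=False,
    execution_timeout=timedelta(hours=1),
)

I am not sure this is enough, though: env is also a templated field on BashOperator, so the values might still show up in the Rendered view of the UI. I'd appreciate confirmation that this (or something better, such as a Boto config file on the workers) actually keeps the keys out of the logs.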