I have setup a multi-node druid cluster with: 1) 1 node running as coordinator and overlord (m4.xl) 2) 2 nodes each running historical and middle managers both. (r3.2xl) 3) 1 node running broker (r3.2xl)
Now I have an EMR cluster running which I want to use for ingestion tasks, the problem is whenever I try to submit a job via the CURL command, the job always starts as local hadoop job in both the middle managers instead of being submitted to the remote EMR cluster. My data lies in S3 and also S3 is configured for deep storage as well.
I have also copied all the jars from EMR master to hadoop-dependencies/hadoop-client/2.7.3/
Druid version: 0.9.2 EMR version: 5.2
Please find attached indexing job, common runtime properties and middle manager runtime properties.
- Q1) How to get the job to submit to remote EMR cluster.
- Q2) Logs for the indexing task are not coming on overlord:8090, how to enable it.
File: data_index.json
{
"type": "index_hadoop",
"spec": {
"ioConfig": {
"type": "hadoop",
"inputSpec": {
"type": "static",
"paths": "s3n://<kjcnskd>smallTest"
}
},
"dataSchema": {
"dataSource": "multi_value_test_01",
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "day",
"queryGranularity": "none",
"intervals": [
"2011-09-12/2017-09-13"
]
},
"parser": {
"type": "string",
"parseSpec": {
"format": "tsv",
"delimiter": "\u0001",
"listDelimiter": "|",
"columns": [
"article_type",
"brand",
"gender",
"brand_type",
"master_category",
"supply_type",
"business_unit",
"testdim",
"date",
"week",
"month",
"year",
"style_id",
"live_styles",
"non_live_styles",
"broken_style",
"new_season_styles",
"live_styles_qty",
"non_live_styles_qty",
"broken_style_qty",
"new_season_styles_qty"
],
"dimensionsSpec": {
"dimensions": [
"article_type",
"brand",
"gender",
"brand_type",
"master_category",
"supply_type",
"business_unit",
"testdim",
"week",
"month",
"year",
"style_id"
]
},
"timestampSpec": {
"column": "date",
"format": "yyyyMMdd"
}
}
},
"metricsSpec": [
{
"name": "live_styles",
"type": "doubleSum",
"fieldName": "live_styles"
},
{
"name": "non_live_styles",
"type": "doubleSum",
"fieldName": "non_live_styles"
},
{
"name": "broken_style",
"type": "doubleSum",
"fieldName": "broken_style"
},
{
"name": "new_season_styles",
"type": "doubleSum",
"fieldName": "new_season_styles"
},
{
"name": "live_styles_qty",
"type": "doubleSum",
"fieldName": "live_styles_qty"
},
{
"name": "broken_style_qty",
"type": "doubleSum",
"fieldName": "broken_style_qty"
},
{
"name": "new_season_styles_qty",
"type": "doubleSum",
"fieldName": "new_season_styles_qty"
}
]
},
"tuningConfig": {
"type": "hadoop",
"partitionsSpec": {
"type": "hashed",
"targetPartitionSize": 5000000
},
"jobProperties": {
"fs.s3.awsAccessKeyId": "XXXXXXXXXXXXXX",
"fs.s3.awsSecretAccessKey": "XXXXXXXXXXXXXX",
"fs.s3.impl": "org.apache.hadoop.fs.s3native.NativeS3FileSystem",
"fs.s3n.awsAccessKeyId": "XXXXXXXXXXXXXX",
"fs.s3n.awsSecretAccessKey": "XXXXXXXXXXXXXX",
"fs.s3n.impl": "org.apache.hadoop.fs.s3native.NativeS3FileSystem",
"io.compression.codecs": "org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec"
}
}
}
}
File: common.runtime.properties
#
# Licensed to Metamarkets Group Inc. (Metamarkets) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. Metamarkets licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#
# Extensions
#
# This is not the full list of Druid extensions, but common ones that people often use. You may need to change this list
# based on your particular setup.
druid.extensions.loadList=["druid-kafka-eight", "druid-s3-extensions", "druid-histogram", "druid-datasketches", "druid-lookups-cached-global", "mysql-metadata-storage"]
# If you have a different version of Hadoop, place your Hadoop client jar files in your hadoop-dependencies directory
# and uncomment the line below to point to your directory.
druid.extensions.hadoopDependenciesDir=hadoop-dependencies/hadoop-client/2.7.3
#
# Logging
#
# Log all runtime properties on startup. Disable to avoid logging properties on startup:
druid.startup.logging.logProperties=true
#
# Zookeeper
#
druid.zk.service.host=10.0.1.152
druid.zk.paths.base=/druid
#
# Metadata storage
#
# For Derby server on your Druid Coordinator (only viable in a cluster with a single Coordinator, no fail-over):
#druid.metadata.storage.type=derby
#druid.metadata.storage.connector.connectURI=jdbc:derby://metadata.store.ip:1527/var/druid/metadata.db;create=true
#druid.metadata.storage.connector.host=metadata.store.ip
#druid.metadata.storage.connector.port=1527
# For MySQL:
druid.metadata.storage.type=mysql
druid.metadata.storage.connector.connectURI=jdbc:mysql://10.0.1.140:3306/druid
druid.metadata.storage.connector.user=druid
druid.metadata.storage.connector.password=druid123
# For PostgreSQL (make sure to additionally include the Postgres extension):
#druid.metadata.storage.type=postgresql
#druid.metadata.storage.connector.connectURI=jdbc:postgresql://db.example.com:5432/druid
#druid.metadata.storage.connector.user=...
#druid.metadata.storage.connector.password=...
#
# Deep storage
#
# For local disk (only viable in a cluster if this is a network mount):
#druid.storage.type=local
#druid.storage.storageDirectory=var/druid/segments
# For HDFS (make sure to include the HDFS extension and that your Hadoop config files in the cp):
#druid.storage.type=hdfs
#druid.storage.storageDirectory=/druid/segments
# For S3:
druid.storage.type=s3
druid.storage.bucket=asfvdcs
druid.storage.baseKey=druid/segments
druid.s3.accessKey=XXXXXXXXXXXX
druid.s3.secretKey=XXXXXXXXXXXX
#
# Indexing service logs
#
# For local disk (only viable in a cluster if this is a network mount):
druid.indexer.logs.type=file
druid.indexer.logs.directory=var/druid/indexing-logs
# For HDFS (make sure to include the HDFS extension and that your Hadoop config files in the cp):
#druid.indexer.logs.type=hdfs
#druid.indexer.logs.directory=/druid/indexing-logs
# For S3:
#druid.indexer.logs.type=s3
#druid.indexer.logs.s3Bucket=testashutosh
#druid.indexer.logs.s3Prefix=druid/indexing-logs
#
# Service discovery
#
druid.selectors.indexing.serviceName=druid/overlord
druid.selectors.coordinator.serviceName=druid/coordinator
#
# Monitoring
#
druid.monitoring.monitors=["com.metamx.metrics.JvmMonitor"]
druid.emitter=logging
druid.emitter.logging.logLevel=info
File: middle manager runtime.properties
druid.service=druid/middleManager
druid.port=8091
# Number of tasks per middleManager
druid.worker.capacity=3
# Task launch parameters
druid.indexer.runner.javaOpts=-server -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
druid.indexer.task.baseTaskDir=var/druid/task
# HTTP server threads
druid.server.http.numThreads=25
# Processing threads and buffers
druid.processing.buffer.sizeBytes=536870912
druid.processing.numThreads=2
# Hadoop indexing
druid.indexer.task.hadoopWorkingPath=hdfs://ip-10-0-1-xxx.ap-southeast-1.compute.internal:8020/tmp/druid-indexing
druid.indexer.task.defaultHadoopCoordinates=["org.apache.hadoop:hadoop-client:2.7.3"]
druid.indexer.runner.type=remote