I have one source CSV file (without header) as well as header config CSV file (contains only column names) in GCS. I also have static table in Bigquery. I want to load source file into static table by using column header mapping (config file).
I was tried with different approach earlier(I was maintain source file which contain header and data in same file and then tried to split header from source file then insert those data into Bigquery by using header column mapping. I noticed this approach is NOT possible because dataflow shuffle data into multiple worker node. so i dropped this approach.
The below code i have used hard coded column names. I am looking approach to read column names from external config file (I want to make my code as dynamic).
package com.coe.cog;
import java.io.BufferedReader;
import java.util.*;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.values.PCollection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
public class SampleTest {
private static final Logger LOG = LoggerFactory.getLogger(SampleTest.class);
public static TableReference getGCDSTableReference() {
TableReference ref = new TableReference();
ref.setProjectId("myownproject");
ref.setDatasetId("DS_Employee");
ref.setTableId("tLoad14");
return ref;
}
static class TransformToTable extends DoFn<String, TableRow> {
@ProcessElement
public void processElement(ProcessContext c) {
String csvSplitBy = ",";
String lineHeader = "ID,NAME,AGE,SEX"; // Hard code column name but i want to read these header from GCS file.
String[] colmnsHeader = lineHeader.split(csvSplitBy); //Only Header array
String[] split = c.element().split(csvSplitBy); //Data section
TableRow row = new TableRow();
for (int i = 0; i < split.length; i++) {
row.set(colmnsHeader[i], split[i]);
}
c.output(row);
// }
}
}
public interface MyOptions extends PipelineOptions {
/*
* Param
*
*/
}
public static void main(String[] args) {
MyOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(MyOptions.class);
options.setTempLocation("gs://demo-bucket-data/temp");
Pipeline p = Pipeline.create(options);
PCollection<String> lines = p.apply("Read From Storage", TextIO.read().from("gs://demo-bucket-data/Demo/Test/SourceFile_WithOutHeader.csv"));
PCollection<TableRow> rows = lines.apply("Transform To Table",ParDo.of(new TransformToTable()));
rows.apply("Write To Table",BigQueryIO.writeTableRows().to(getGCDSTableReference())
.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
.withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_NEVER));
p.run();
}
}
Source File:
1,John,25,M
2,Smith,30,M
3,Josephine,20,F
Config File (Headers only):
ID,NAME,AGE,SEX