We use a custom USQL extractor to flatten a json structure. The below sample code works fine if line(json object) of json is less than 4 MB. If the line size is above 4 MB, then we get error "A record in the input file is longer than 4194304 bytes." The similar code is tried in C# stand alone application for lines higher than 4 MB, it works fine. Do we have any restriction on json size with usql custom extractor? How do we handle json messages with size more than 4 MB?
The error is thrown from the highlighted line in below code
string line = lineReader.ReadToEnd();
Custom extractor Sample code
using Microsoft.Analytics.Interfaces;
using System.Collections.Generic;
using System.IO;
using System.Text;
using Microsoft.Analytics.Types.Sql;
using Newtonsoft.Json;
namespace Company.DataLakeAnalytics
{
[SqlUserDefinedExtractor(AtomicFileProcessing = false)]
public class CustomJSONExtractor : IExtractor
{
private readonly Encoding _encoding;
private readonly byte[] _row_delim;
private string DELIMITER = "~";
public CustomJSONExtractor(Encoding encoding = null, string row_delim = "\r\n")
{
_encoding = Encoding.UTF8;
_row_delim = _encoding.GetBytes(row_delim);
}
//Every json line in the raw file is transformed to a flat structure
public override IEnumerable Extract(IUnstructuredReader input, IUpdatableRow output)
{
//Read the input line by line
foreach (Stream current in input.Split(_row_delim))
{
using (StreamReader lineReader = new StreamReader(current, this._encoding))
{
//reads the entire line
string line = lineReader.ReadToEnd();
//break the line to multiple variables
output.Set(1, "A~1");
yield return output.AsReadOnly();
}
}
}
}
}
sample USQL code
DECLARE @INPUT_FILE="sample-data.txt"; @jsonDatafile = EXTRACT key string, jsonObjStr string FROM @INPUT_FILE USING new Damen.DataLakeAnalytics.CustomJSONExtractor(null,row_delim:"\n") ; @dataJsonObject = SELECT jsonObjStr AS rawData FROM @dataAsStrings; OUTPUT @dataJsonObject TO @flattenedOutputFile USING Outputters.Text(outputHeader:false,quoting: false,delimiter:'~');