Read multiple xls files from zip file - apache-poi

I'm trying to read multiple xls files that are in a .zip file using the code below, and I'm getting a "missed end of block" error. Do I need to append an EOF-style character to the end of the byte array before I hand it to POI?
The code is below:
val zipStream = new ZipInputStream(inputStream)
var zipEntry: ZipEntry = null
while ({zipEntry = zipStream.getNextEntry; zipEntry != null}) {
  val bytes = new Array[Byte](zipEntry.getSize.toInt) // getSize can be -1 when the size is unknown
  zipStream.read(bytes) // a single read may return fewer bytes than requested
  val xlsByteStream = new ByteArrayInputStream(bytes)
  val workbook = new XSSFWorkbook(xlsByteStream)
}

val zipStream = new ZipInputStream(inputStream)
var zipEntry: ZipEntry = null
while ({zipEntry = zipStream.getNextEntry; zipEntry != null}) {
  val bytes = new Array[Byte](1024)
  val bos = new ByteArrayOutputStream(zipEntry.getSize.toInt)
  var i = 0
  while ({i = zipStream.read(bytes); i > 0}) {
    bos.write(bytes, 0, i)
  }
  bos.close()
  val xlsByteStream = new ByteArrayInputStream(bos.toByteArray)
  val workbook = new XSSFWorkbook(xlsByteStream)
}
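For reference, no EOF marker should be needed; POI just has to receive the complete entry bytes. Below is a minimal sketch of the buffering approach, assuming Scala 2 and the XSSFWorkbook(InputStream) constructor, and not trusting getSize (it can be -1 for stream-created archives):

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, InputStream}
import java.util.zip.{ZipEntry, ZipInputStream}
import org.apache.poi.xssf.usermodel.XSSFWorkbook
import scala.collection.mutable.ListBuffer

def readWorkbooks(inputStream: InputStream): List[XSSFWorkbook] = {
  val zipStream = new ZipInputStream(inputStream)
  val workbooks = ListBuffer.empty[XSSFWorkbook]
  var entry: ZipEntry = zipStream.getNextEntry
  while (entry != null) {
    val bos = new ByteArrayOutputStream() // don't pre-size from getSize: it may be -1
    val buffer = new Array[Byte](8192)
    var n = zipStream.read(buffer)
    while (n > 0) {
      bos.write(buffer, 0, n)
      n = zipStream.read(buffer)
    }
    // the entry is now fully buffered; hand the complete bytes to POI
    workbooks += new XSSFWorkbook(new ByteArrayInputStream(bos.toByteArray))
    entry = zipStream.getNextEntry
  }
  zipStream.close()
  workbooks.toList
}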

Related

How to correctly read the corresponding columnChunk in parquet according to the specified schema?

1. I use a custom SparkSQL plugin to start a spark-shell terminal and execute the following commands in it:
import org.apache.spark.sql.{DataFrame, DataFrameReader, Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType}
sc.setLogLevel("WARN")
val spark: SparkSession = SparkSession.builder().appName("Test").config("parquet.enable.dictionary","false").getOrCreate()
val res = spark.sql("SELECT USER_ID FROM TEST_PARQUET_10G where USER_ID >= 0 and HOUR_ID > 0")
res.collect()
2. This is my table creation statement
CREATE TABLE TEST_PARQUET_10G (
  USER_ID BIGINT,
  SERIAL_NUMBER BIGINT,
  KAFKA_TIME BIGINT,
  ACCT_DATE BIGINT,
  HOUR_ID BIGINT
)
COMMENT 'Long type'
STORED AS PARQUET
TBLPROPERTIES ('parquet.compression'='none')
LOCATION '/user/hive/warehouse/hanlei/Long_PARQUET/TEST_PARQUET_10G'
3. I customized a SQL plug-in in which I rewrote the rules for reading Parquet files. In this step I set USER_ID and HOUR_ID in my reader.
This is my reader code:
private def createParquetFileReader(file: PartitionedFile): ParquetFileReader = {
  val conf = broadcastedConf.value.value
  val filePath = new Path(file.filePath)
  val split = new FileSplit(filePath, file.start, file.length, Array.empty[String])
  val reader = ParquetFileReader.open(HadoopInputFile.fromPath(filePath, conf))
  val requiredNameArr = readDataSchema.fieldNames
  val requiredSchemaName: Set[String] = requiredNameArr.toSet
  val fields: util.List[Type] = reader.getFileMetaData.getSchema.getFields
  fields.removeIf { field =>
    !requiredSchemaName.contains(field.getName)
  }
  requiredSchema = new MessageType("requiredSchema", fields)
  reader.setRequestedSchema(requiredSchema)
  reader
}
4. But when I call the reader's readNextRowGroup method, the program reports an error (stack trace in step 6 below). The calling code is here:
class ParquetDataPagesPartitionReader(
    parquetReader: ParquetFileReader,
    readDataSchema: StructType,
    requiredSchema: MessageType
) extends PartitionReader[ColumnarBatch]
  with ScanWithMetrics
  with Arm
  with Logging {

  private var batch: Option[ColumnarBatch] = None
  //private val columns: Iterator[String] = readDataSchema.fieldNames.iterator
  var rowGroup: PageReadStore = _
  val columns: util.List[ColumnDescriptor] = requiredSchema.getColumns

  override def next(): Boolean = {
    batch.foreach(_.close())
    rowGroup = parquetReader.readNextRowGroup()
    batch = if (rowGroup != null) {
      readBatch()
    } else {
      None
    }
    batch.isDefined
  }

  // one batch is all columnChunk's dataPages from a rowGroup
  private def readBatch(): Option[ColumnarBatch] = {
    val starTime = System.currentTimeMillis()
    // val starTime = System.nanoTime()
    logError(s"startTime ${starTime}")
    val dpuColumnVectors: Array[DpuColumnVector] = new Array[DpuColumnVector](columns.size())
    for (i <- 0 until readDataSchema.length) {
      val raceDType = DpuColumnVector.getRaceDataType(readDataSchema(i).dataType)
      val raceVec = new RaceColumnVector(raceDType, DpuBatchUtils.DPU_MAX_ROWS)
      logDebug(s"try to malloc ${i} column")
      raceVec.setRowSizeAndMalloc()
      logDebug(s"successful malloc ${i} column")
      dpuColumnVectors(i) = new DpuColumnVector(readDataSchema(i).dataType, raceVec)
    }
    var useTime: Long = 0
    for (index <- 0 until readDataSchema.length) {
      var offset: Int = 0
      val pageReader = rowGroup.getPageReader(columns.get(index))
      var page: DataPageV1 = pageReader.readPage.asInstanceOf[DataPageV1]
      logDebug(s"appending data to column ${index}")
      while (page != null && offset < DpuBatchUtils.DPU_MAX_ROWS) {
        dpuColumnVectors(index).dataType() match {
          case LongType =>
            val startReadingIntoMemory = System.currentTimeMillis()
            val byteArray: Array[Byte] = page.getBytes.toByteArray
            val bytes: Array[Byte] = byteArray.slice(8, byteArray.length)
            val endReadIntoMemory = System.currentTimeMillis()
            //logError(s"successful read into memory column: ${columns.get(index).getPath.apply(0)} use time: ${endReadIntoMemory - startReadingIntoMemory}")
            dpuColumnVectors(index).getRaceColumnVector
              // todo: The byte array here is not decoded
              .appendValuesFromBuffer(bytes, 0, page.getValueCount)
            val endAppendToRace = System.currentTimeMillis()
            //logError(s"successful read into race, appended ${page.getValueCount} rows from ${offset} use time ${endAppendToRace - endReadIntoMemory}")
          case _ => throw new RuntimeException(RaceConstant.UNSUPPORTED_DATA_TYPE)
        }
        offset += page.getValueCount
        page = pageReader.readPage.asInstanceOf[DataPageV1]
      }
    }
    val endTime = System.currentTimeMillis()
    // val endTime = System.nanoTime()
    // useTime = useTime + endTime - starTime
    val columnarBatch = new ColumnarBatch(dpuColumnVectors.toArray, rowGroup.getRowCount.toInt)
    logError(s"endTime: ${endTime}")
    logError(s"read batch success, write to race successful, batchRows: ${rowGroup.getRowCount}, column number: ${columns.size()}, use time: ${endTime - starTime}")
    Some(columnarBatch)
  }

  override def get(): ColumnarBatch = {
    val ret = batch.getOrElse(throw new NoSuchElementException)
    batch = None
    ret
  }

  override def close(): Unit = {
    if (parquetReader != null) parquetReader.close()
    batch.foreach(_.close())
    batch = None
  }
}
5. In this reader I have the schema I injected, but the column index is still mapped according to the original schema.
The injected columns are HOUR_ID and USER_ID.
6. Here is my call stack info
stack info
How do I read parquet's columnChunk through a custom mapping?
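If the mismatch is that readDataSchema's positional index does not line up with the column order of the requested MessageType, one option is to resolve each ColumnDescriptor by its column path instead of by index. A minimal sketch, not tested against this plugin, with descriptorFor as a hypothetical helper:

import org.apache.parquet.column.ColumnDescriptor
import org.apache.parquet.schema.MessageType
import scala.collection.JavaConverters._

// Hypothetical helper: find the descriptor whose path matches a Spark field name,
// so the loop in readBatch no longer depends on the two schemas sharing an order.
def descriptorFor(schema: MessageType, fieldName: String): ColumnDescriptor =
  schema.getColumns.asScala
    .find(_.getPath.headOption.contains(fieldName))
    .getOrElse(throw new IllegalArgumentException(s"column $fieldName not in requested schema"))

// Usage inside readBatch (sketch):
// val pageReader = rowGroup.getPageReader(descriptorFor(requiredSchema, readDataSchema(index).name))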

How do I read a delimited file inside a zip file in a Spark RDD

I have an HDFS location and there is a zip file inside that location:
HDFS location: /development/staging/b8baf3f4-abce-11eb-8592-0242ac110032/records.zip
scala> val loc = "/development/staging/b8baf3f4-abce-11eb-8592-0242ac110032/"
loc: String = "/development/staging/b8baf3f4-abce-11eb-8592-0242ac110032/"
scala> val rdd = sc.textFile(loc)
rdd: org.apache.spark.rdd.RDD[String] = /development/staging/b8baf3f4-abce-11eb-8592-0242ac110032/ MapPartitionsRDD[1] at textFile at <console>:26
scala> rdd.take(2)
res0: Array[String] = Array(PK????????]R�R��*�????�??? ???2972120.dat�S�r�0?
��*�0����?t?�]T�Ж??����
`�6ط�kU;P�M�� rSO�;G��p��?��?�Z1^3#�^�� ��F��ٕb�?~,ٖ
�u6�D��'�#�??��L*�Gp?�kcL�7!r�p1�1e�� a*.{?
�.;��������s�(�)�, ?�=�9U<"*!?5��?;�?�?�مd{h}
��gG���� �?�Z)
but it produces garbled output instead of the file's contents.
Can you help with how to read a file inside a zip file using a Spark RDD? There is only one file inside my zip file.
Are you looking for something like this:
import java.io.{ IOException, FileOutputStream, FileInputStream, File }
import java.util.zip.{ ZipEntry, ZipInputStream }
import org.apache.spark.sql.SQLContext
import org.apache.spark.SparkContext

//Unzip the file and copy the internal contents outside in new location
object Unzip extends App {

  val INPUT_ZIP_FILE: String = "src/resources/my-zip.zip";
  val OUTPUT_FOLDER: String = "src/resources/my-zip";

  def unZipIt(zipFile: String, outputFolder: String): Unit = {
    val buffer = new Array[Byte](1024)
    try {
      //output directory
      val folder = new File(OUTPUT_FOLDER);
      if (!folder.exists()) {
        folder.mkdir();
      }
      //zip file content
      val zis: ZipInputStream = new ZipInputStream(new FileInputStream(zipFile));
      //get the zipped file list entry
      var ze: ZipEntry = zis.getNextEntry();
      while (ze != null) {
        val fileName = ze.getName();
        val newFile = new File(outputFolder + File.separator + fileName);
        System.out.println("file unzip : " + newFile.getAbsoluteFile());
        //create folders
        new File(newFile.getParent()).mkdirs();
        val fos = new FileOutputStream(newFile);
        var len: Int = zis.read(buffer);
        while (len > 0) {
          fos.write(buffer, 0, len)
          len = zis.read(buffer)
        }
        fos.close()
        ze = zis.getNextEntry()
      }
      zis.closeEntry()
      zis.close()
    } catch {
      case e: IOException => println("exception caught: " + e.getMessage)
    }
  }

  Unzip.unZipIt(INPUT_ZIP_FILE, OUTPUT_FOLDER)
  val sac = new SparkContext("local[*]", " first Program");
  val sqlc = new SQLContext(sac);
  val rdd = sac.textFile("src/resources/my-zip/sample.txt")
  rdd.take(1).foreach(println)
  /*val rddFromFile = sqlc.sparkContext.textFile("src/resources/my-zip/sample.txt")
  println(rddFromFile.getClass)
  println("##Get data Using collect")
  rddFromFile.collect().foreach(f => {
    println(f)
  })*/
}
Not sure if this achieves what you want to do, but maybe it could help a bit!
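Another route, if unpacking to the local filesystem is not an option: read the archive bytes with sc.binaryFiles and unzip on the executors. A minimal sketch, assuming (as stated above) that each zip holds a single text entry:

import java.io.{BufferedReader, InputStreamReader}
import java.util.zip.ZipInputStream

val zipsRdd = sc.binaryFiles("/development/staging/b8baf3f4-abce-11eb-8592-0242ac110032/")
val lines = zipsRdd.flatMap { case (_, portableStream) =>
  val zis = new ZipInputStream(portableStream.open())
  zis.getNextEntry // position on the single entry inside the archive
  val reader = new BufferedReader(new InputStreamReader(zis))
  // materialise the lines before closing the stream
  val content = Iterator.continually(reader.readLine()).takeWhile(_ != null).toList
  zis.close()
  content
}
lines.take(2).foreach(println)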

Reading an Excel file using Apache POI in Scala

I'm trying to read an Excel file using POI SXSSF. For some reason sheet.rowIterator is returning an empty iterator even though there are rows in the sheet. Here is the code I have:
import java.io.File
import org.apache.poi.xssf.streaming.SXSSFWorkbook
import org.apache.poi.xssf.usermodel.XSSFWorkbook
import scala.collection.JavaConverters._

class ExcelReader {
  final val fileName = "c:\\temp\\data-200.xlsx"

  def read(): Iterator[Contact] = {
    val file = new File(fileName)
    val workBook = new SXSSFWorkbook(new XSSFWorkbook(file), 100)
    val sheet = workBook.getSheetAt(0) // this works, gets sheet name
    Console.println(s"Sheet Name: ${sheet.getSheetName()}")
    val rowItr = sheet.rowIterator().asScala // this is an empty iterator
    for (e <- rowItr) yield Contact(
      e.getCell(0).getStringCellValue(),
      e.getCell(1).getStringCellValue(),
      e.getCell(2).getStringCellValue(),
      e.getCell(3).getStringCellValue(),
      e.getCell(4).getStringCellValue())
  }
}
Not sure what I'm doing wrong.
Here is a simple example of reading an Excel file that I have tried.
import java.io.{File, FileInputStream}
import org.apache.poi.hssf.usermodel.HSSFWorkbook
import org.apache.poi.ss.usermodel.Cell

val myFile = new File("/home/sakoirala/Downloads/eu-historical-price-series_en.xls")
val fis = new FileInputStream(myFile)
val myWorkbook = new HSSFWorkbook(fis)
val mySheet = myWorkbook.getSheetAt(0)
val rowIterator = mySheet.iterator()
while (rowIterator.hasNext) {
  val row = rowIterator.next()
  val cellIterator = row.cellIterator()
  while (cellIterator.hasNext) {
    val cell = cellIterator.next()
    cell.getCellType match {
      case Cell.CELL_TYPE_STRING  => print(cell.getStringCellValue + "\t")
      case Cell.CELL_TYPE_NUMERIC => print(cell.getNumericCellValue + "\t")
      case Cell.CELL_TYPE_BOOLEAN => print(cell.getBooleanCellValue + "\t")
      case Cell.CELL_TYPE_BLANK   => print("null" + "\t")
      case _ => throw new RuntimeException("this error occurred when reading")
      // case Cell.CELL_TYPE_FORMULA => {print(cell.getF + "\t")}
    }
  }
  println("")
}
Hope this helps!
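For the .xlsx case in the question, here is a minimal sketch that reads rows straight from the XSSFWorkbook, on the assumption that SXSSF is only needed for streaming writes and its sheets do not expose rows that already exist in the file:

import java.io.File
import org.apache.poi.xssf.usermodel.XSSFWorkbook
import scala.collection.JavaConverters._

val workbook = new XSSFWorkbook(new File("c:\\temp\\data-200.xlsx")) // path from the question
val sheet = workbook.getSheetAt(0)
val firstColumn = sheet.rowIterator().asScala
  .map(row => row.getCell(0).getStringCellValue) // read the first cell of every row
  .toList
workbook.close()
firstColumn.foreach(println)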

Insert fields in File record using Groovy

I have the code below in Groovy. Basically what I'm trying to do is read a set of input records and merge them into one or more records with a common key combination.
The key combination is as shown below. After reading the input file, I have written the key and fields into a HashMap (see code). Now I need to check the key in the input file: if the key has been seen, then I have to write the merged output record; otherwise I just need to write an output record without merging. My question:
What is the command to insert a field into the output record?
import java.util.Properties;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

for (int i = 0; i < dataContext.getDataCount(); i++) {
    InputStream is = dataContext.getStream(i);
    Properties props = dataContext.getProperties(i);
    reader = new BufferedReader(new InputStreamReader(is));
    /* This is how to declare a HashMap */
    def forcastMap = [:]
    String Key;
    String Shipfrom = "";
    String Item = "";
    String Fcast = "";
    String Shipto = "";
    String Planned_Arrival_Date = "";
    String Qty = "";
    String PrevKey = "";
    String line = null
    while ((line = reader.readLine()) != null) {
        if (line.length() > 20) { // make sure it is a data line so we can do substring manipulation
            // NOTE: String.substring takes (beginIndex, endIndex); some of these ranges
            // have endIndex < beginIndex and will throw at runtime
            Shipfrom = line.substring(35, 12)
            Item = line.substring(50, 50)
            Fcast = line.substring(10, 50)
            Shipto = line.substring(75, 10)
            Planned_Arrival_Date = line.substring(85, 8)
            Qty = line.substring(90, 12)
            Key = (Shipfrom + Item + Fcast + Shipto)
            forcastMap.put(Key, [Planned_Arrival_Date, Qty]) // Map.put takes a single value; store both fields in a list
            if (Key != PrevKey) {
            }
        }
    }
    //dataContext.storeStream(is, props);
}

How to automatically save a *.txt file while open? (or copy a *.txt file) [C#, Visual Studio]

Can somebody help?
After a button click I choose any *.txt file on my PC (using openFileDialog) and open it, and the program should save it automatically (without a file dialog) as BlaBla.txt, or maybe make a copy of the *.txt file.
I have tried many ways; all I get is an empty blabla.txt.
if (openFileDialog1.ShowDialog() == DialogResult.OK)
{
    System.IO.StreamReader sr = new System.IO.StreamReader(openFileDialog1.FileName);
    string line = sr.ReadLine();
    System.IO.StreamWriter file = new System.IO.StreamWriter("BlaBla.txt");
    file.WriteLine(sr);
}
Edited the code. Still does not work.
if (openFileDialog1.ShowDialog() == DialogResult.OK)
{
    System.IO.StreamReader sr = new System.IO.StreamReader(openFileDialog1.FileName);
    String line = sr.ReadLine();
    // System.IO.StreamWriter file = new System.IO.StreamWriter("ddd.txt");
    // file.Write(sr);
    System.IO.StreamWriter file = new System.IO.StreamWriter("ddd.txt");
    file.Write(sr);
    file.Flush();
}
I found another way to save; it works.
if (openFileDialog1.ShowDialog() == DialogResult.OK)
{
    string org, copy;
    System.IO.StreamReader file = new System.IO.StreamReader(openFileDialog1.FileName);
    while ((org = file.ReadLine()) != null)
    {
        copy = org.ToString();
        using (System.IO.StreamWriter files = new System.IO.StreamWriter("blabla.txt", true))
        {
            files.WriteLine(copy);
        }
    }
    file.Close();
}
