chore: Spark POC Case #1 - Initial Commit
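
Adds a Spring Boot WebFlux service that accepts product CSV uploads,
computes discounted offer prices with a Spark UDF, publishes the records
to a Kafka topic via Spark Structured Streaming, and exposes an endpoint
that reads them back filtered by batch id. Also includes the Maven
wrapper, .gitignore, and pom.xml build configuration.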

HELP.md
target/
!.mvn/wrapper/maven-wrapper.jar
!**/src/main/**/target/
!**/src/test/**/target/
### STS ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache
### IntelliJ IDEA ###
.idea
*.iws
*.iml
*.ipr
### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/
build/
!**/src/main/**/build/
!**/src/test/**/build/
### VS Code ###
.vscode/
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.9.2/apache-maven-3.9.2-bin.zip
wrapperUrl=https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.2.0/maven-wrapper-3.2.0.jar
@REM ----------------------------------------------------------------------------
@REM Licensed to the Apache Software Foundation (ASF) under one
@REM or more contributor license agreements. See the NOTICE file
@REM distributed with this work for additional information
@REM regarding copyright ownership. The ASF licenses this file
@REM to you under the Apache License, Version 2.0 (the
@REM "License"); you may not use this file except in compliance
@REM with the License. You may obtain a copy of the License at
@REM
@REM https://www.apache.org/licenses/LICENSE-2.0
@REM
@REM Unless required by applicable law or agreed to in writing,
@REM software distributed under the License is distributed on an
@REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@REM KIND, either express or implied. See the License for the
@REM specific language governing permissions and limitations
@REM under the License.
@REM ----------------------------------------------------------------------------
@REM ----------------------------------------------------------------------------
@REM Apache Maven Wrapper startup batch script, version 3.2.0
@REM
@REM Required ENV vars:
@REM JAVA_HOME - location of a JDK home dir
@REM
@REM Optional ENV vars
@REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands
@REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a keystroke before ending
@REM MAVEN_OPTS - parameters passed to the Java VM when running Maven
@REM e.g. to debug Maven itself, use
@REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
@REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files
@REM ----------------------------------------------------------------------------
@REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on'
@echo off
@REM set title of command window
title %0
@REM enable echoing by setting MAVEN_BATCH_ECHO to 'on'
@if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO%
@REM set %HOME% to equivalent of $HOME
if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%")
@REM Execute a user defined script before this one
if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre
@REM check for pre script, once with legacy .bat ending and once with .cmd ending
if exist "%USERPROFILE%\mavenrc_pre.bat" call "%USERPROFILE%\mavenrc_pre.bat" %*
if exist "%USERPROFILE%\mavenrc_pre.cmd" call "%USERPROFILE%\mavenrc_pre.cmd" %*
:skipRcPre
@setlocal
set ERROR_CODE=0
@REM To isolate internal variables from possible post scripts, we use another setlocal
@setlocal
@REM ==== START VALIDATION ====
if not "%JAVA_HOME%" == "" goto OkJHome
echo.
echo Error: JAVA_HOME not found in your environment. >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error
:OkJHome
if exist "%JAVA_HOME%\bin\java.exe" goto init
echo.
echo Error: JAVA_HOME is set to an invalid directory. >&2
echo JAVA_HOME = "%JAVA_HOME%" >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error
@REM ==== END VALIDATION ====
:init
@REM Find the project base dir, i.e. the directory that contains the folder ".mvn".
@REM Fallback to current working directory if not found.
set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR%
IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir
set EXEC_DIR=%CD%
set WDIR=%EXEC_DIR%
:findBaseDir
IF EXIST "%WDIR%"\.mvn goto baseDirFound
cd ..
IF "%WDIR%"=="%CD%" goto baseDirNotFound
set WDIR=%CD%
goto findBaseDir
:baseDirFound
set MAVEN_PROJECTBASEDIR=%WDIR%
cd "%EXEC_DIR%"
goto endDetectBaseDir
:baseDirNotFound
set MAVEN_PROJECTBASEDIR=%EXEC_DIR%
cd "%EXEC_DIR%"
:endDetectBaseDir
IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig
@setlocal EnableExtensions EnableDelayedExpansion
for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! %%a
@endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS%
:endReadAdditionalConfig
SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe"
set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar"
set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain
set WRAPPER_URL="https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.2.0/maven-wrapper-3.2.0.jar"
FOR /F "usebackq tokens=1,2 delims==" %%A IN ("%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties") DO (
IF "%%A"=="wrapperUrl" SET WRAPPER_URL=%%B
)
@REM Extension to allow automatically downloading the maven-wrapper.jar from Maven-central
@REM This allows using the maven wrapper in projects that prohibit checking in binary data.
if exist %WRAPPER_JAR% (
if "%MVNW_VERBOSE%" == "true" (
echo Found %WRAPPER_JAR%
)
) else (
if not "%MVNW_REPOURL%" == "" (
SET WRAPPER_URL="%MVNW_REPOURL%/org/apache/maven/wrapper/maven-wrapper/3.2.0/maven-wrapper-3.2.0.jar"
)
if "%MVNW_VERBOSE%" == "true" (
echo Couldn't find %WRAPPER_JAR%, downloading it ...
echo Downloading from: %WRAPPER_URL%
)
powershell -Command "&{"^
"$webclient = new-object System.Net.WebClient;"^
"if (-not ([string]::IsNullOrEmpty('%MVNW_USERNAME%') -and [string]::IsNullOrEmpty('%MVNW_PASSWORD%'))) {"^
"$webclient.Credentials = new-object System.Net.NetworkCredential('%MVNW_USERNAME%', '%MVNW_PASSWORD%');"^
"}"^
"[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; $webclient.DownloadFile('%WRAPPER_URL%', '%WRAPPER_JAR%')"^
"}"
if "%MVNW_VERBOSE%" == "true" (
echo Finished downloading %WRAPPER_JAR%
)
)
@REM End of extension
@REM If specified, validate the SHA-256 sum of the Maven wrapper jar file
SET WRAPPER_SHA_256_SUM=""
FOR /F "usebackq tokens=1,2 delims==" %%A IN ("%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties") DO (
IF "%%A"=="wrapperSha256Sum" SET WRAPPER_SHA_256_SUM=%%B
)
IF NOT %WRAPPER_SHA_256_SUM%=="" (
powershell -Command "&{"^
"$hash = (Get-FileHash \"%WRAPPER_JAR%\" -Algorithm SHA256).Hash.ToLower();"^
"If('%WRAPPER_SHA_256_SUM%' -ne $hash){"^
" Write-Output 'Error: Failed to validate Maven wrapper SHA-256, your Maven wrapper might be compromised.';"^
" Write-Output 'Investigate or delete %WRAPPER_JAR% to attempt a clean download.';"^
" Write-Output 'If you updated your Maven version, you need to update the specified wrapperSha256Sum property.';"^
" exit 1;"^
"}"^
"}"
if ERRORLEVEL 1 goto error
)
@REM Provide a "standardized" way to retrieve the CLI args that will
@REM work with both Windows and non-Windows executions.
set MAVEN_CMD_LINE_ARGS=%*
%MAVEN_JAVA_EXE% ^
%JVM_CONFIG_MAVEN_PROPS% ^
%MAVEN_OPTS% ^
%MAVEN_DEBUG_OPTS% ^
-classpath %WRAPPER_JAR% ^
"-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" ^
%WRAPPER_LAUNCHER% %MAVEN_CONFIG% %*
if ERRORLEVEL 1 goto error
goto end
:error
set ERROR_CODE=1
:end
@endlocal & set ERROR_CODE=%ERROR_CODE%
if not "%MAVEN_SKIP_RC%"=="" goto skipRcPost
@REM check for post script, once with legacy .bat ending and once with .cmd ending
if exist "%USERPROFILE%\mavenrc_post.bat" call "%USERPROFILE%\mavenrc_post.bat"
if exist "%USERPROFILE%\mavenrc_post.cmd" call "%USERPROFILE%\mavenrc_post.cmd"
:skipRcPost
@REM pause the script if MAVEN_BATCH_PAUSE is set to 'on'
if "%MAVEN_BATCH_PAUSE%"=="on" pause
if "%MAVEN_TERMINATE_CMD%"=="on" exit %ERROR_CODE%
cmd /C exit /B %ERROR_CODE%
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.4.0</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<groupId>com.nisum</groupId>
<artifactId>product-info-api</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>product-info-api</name>
<description>Demo project for Spring Boot</description>
<properties>
<spark.version>3.2.1</spark.version>
<scala.version.major>2.12</scala.version.major>
<scala.version.minor>11</scala.version.minor>
<scala.version>${scala.version.major}.${scala.version.minor}</scala.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<springframework.version>5.3.24</springframework.version>
<kafka.version>2.6.0</kafka.version>
<zookeeper.version>3.5.8</zookeeper.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-webflux</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.kafka</groupId>
<artifactId>spring-kafka</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/io.projectreactor.kafka/reactor-kafka -->
<dependency>
<groupId>io.projectreactor.kafka</groupId>
<artifactId>reactor-kafka</artifactId>
<version>1.1.2.RELEASE</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.junit.vintage</groupId>
<artifactId>junit-vintage-engine</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>io.projectreactor</groupId>
<artifactId>reactor-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework.kafka</groupId>
<artifactId>spring-kafka-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.26</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.10.1</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-mongodb-reactive</artifactId>
</dependency>
<dependency>
<groupId>de.flapdoodle.embed</groupId>
<artifactId>de.flapdoodle.embed.mongo</artifactId>
<version>4.7.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mapstruct</groupId>
<artifactId>mapstruct</artifactId>
<version>1.5.3.Final</version>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongodb-driver-reactivestreams</artifactId>
<version>4.1.0</version>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongodb-driver-sync</artifactId>
<version>4.0.5</version>
</dependency>
<!-- Spark Streaming -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql-kafka-0-10_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.12</artifactId>
<version>${kafka.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.codehaus.janino/janino -->
<dependency>
<groupId>org.codehaus.janino</groupId>
<artifactId>janino</artifactId>
<version>3.0.16</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<annotationProcessorPaths>
<path>
<groupId>org.mapstruct</groupId>
<artifactId>mapstruct-processor</artifactId>
<version>1.5.3.Final</version>
</path>
<path>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.26</version>
</path>
<path>
<groupId>org.projectlombok</groupId>
<artifactId>lombok-mapstruct-binding</artifactId>
<version>0.2.0</version>
</path>
</annotationProcessorPaths>
</configuration>
</plugin>
</plugins>
</build>
</project>
package com.nisum.product;
import org.springframework.boot.WebApplicationType;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.builder.SpringApplicationBuilder;
@SpringBootApplication
public class ProductInfoApiApplication {
public static void main(String[] args) {
new SpringApplicationBuilder(ProductInfoApiApplication.class)
.web(WebApplicationType.REACTIVE)
.run(args);
}
}
package com.nisum.product.controller;
import com.nisum.product.dto.KafkaPublisherResponse;
import com.nisum.product.dto.ProductOffers;
import com.nisum.product.service.ProductsOfferService;
import lombok.extern.slf4j.Slf4j;
import org.apache.spark.sql.streaming.StreamingQueryException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.HttpStatus;
import org.springframework.http.MediaType;
import org.springframework.http.codec.multipart.FilePart;
import org.springframework.web.bind.annotation.*;
import reactor.core.publisher.Flux;
import java.util.concurrent.TimeoutException;
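/**
 * REST entry points for the product-offer pipeline. A hypothetical local
 * invocation (host, port, and file name are illustrative, not part of this
 * commit):
 *
 *   curl -F "files=@products.csv" http://localhost:8080/product/upload-data
 *   curl "http://localhost:8080/product/get?batchId=123456789"
 */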
@Slf4j
@RestController
@RequestMapping(path = "/product")
public class ProductController {
@Autowired
private ProductsOfferService productsOfferService;
@PostMapping(value = "/upload-data", consumes = MediaType.MULTIPART_FORM_DATA_VALUE, produces = MediaType.APPLICATION_JSON_VALUE)
@ResponseStatus(value = HttpStatus.OK)
public Flux<KafkaPublisherResponse> upload(@RequestPart("files") Flux<FilePart> filePartFlux){
return productsOfferService.saveAndProcess(filePartFlux);
}
@GetMapping(value = "/get", produces = MediaType.APPLICATION_JSON_VALUE)
@ResponseStatus(value = HttpStatus.OK)
public Flux<ProductOffers> getProductData(@RequestParam("batchId") String batchId) throws TimeoutException, StreamingQueryException {
return productsOfferService.getProductOffers(batchId);
}
}
package com.nisum.product.dto;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class FileUploadAttributes {
private String fileName;
private String fileContent;
}
package com.nisum.product.dto;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class KafkaPublisherResponse {
private String topicName;
private String batchId;
private String status;
}
package com.nisum.product.dto;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class Product {
private String id;
private String name;
private String description;
private Double price;
}
package com.nisum.product.dto;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class ProductOffers {
private String id;
private String name;
private String description;
private Double price;
private Double discountPercentage;
private Double offerPrice;
private String uniqueId;
private String batchId;
}
package com.nisum.product.service;
import com.nisum.product.dto.FileUploadAttributes;
import com.nisum.product.dto.KafkaPublisherResponse;
import com.nisum.product.dto.ProductOffers;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.sql.*;
import org.apache.spark.sql.streaming.OutputMode;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;
import org.apache.spark.sql.streaming.Trigger;
import org.apache.spark.sql.types.*;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.buffer.DataBufferUtils;
import org.springframework.http.codec.multipart.FilePart;
import org.springframework.stereotype.Service;
import reactor.core.publisher.Flux;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.TimeoutException;
@Slf4j
@Service
public class ProductsOfferService {
@Autowired
SparkSession sparkSession;
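/**
 * Pipeline per uploaded file part: buffer the content, write it to a
 * timestamped batch directory under src/main/resources/data, then stream
 * the CSV to Kafka and report the batch id and status back to the caller.
 */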
public Flux<KafkaPublisherResponse> saveAndProcess(Flux<FilePart> filePartFlux) {
return filePartFlux.flatMap(filePart ->
// join() collects all data buffers of the part, so a file split across
// several buffers is handled as one upload rather than one fragment per buffer
DataBufferUtils.join(filePart.content()).map(dataBuffer -> {
// read only the readable bytes; capacity() can exceed the actual content length
byte[] bytes = new byte[dataBuffer.readableByteCount()];
dataBuffer.read(bytes);
DataBufferUtils.release(dataBuffer);
return FileUploadAttributes.builder()
.fileContent(new String(bytes, StandardCharsets.UTF_8))
.fileName(filePart.filename())
.build();
})
.map(this::saveAsFile)
.map(this::readAndPublishDataToKafka));
}
private String saveAsFile(FileUploadAttributes fileUploadAttributes) {
try {
FileUtils.deleteDirectory(new File("src/main/resources/data/CheckPointLocation"));
} catch (IOException e) {
log.error("Error while deleting checkpoint location dir {}", e.getMessage());
}
String batchId = new SimpleDateFormat("HHmmssSSS").format(new Date());
String dir = "src/main/resources/data/" + batchId;
String fileName = dir + File.separator + fileUploadAttributes.getFileName();
try {
FileUtils.forceMkdir(new File(dir));
FileUtils.writeStringToFile(new File(fileName)
, fileUploadAttributes.getFileContent(), StandardCharsets.UTF_8, true);
return batchId;
} catch (IOException e) {
log.error("Error while creating CSV... {}", e.getMessage());
return StringUtils.EMPTY;
}
}
private KafkaPublisherResponse readAndPublishDataToKafka(String batchId) {
if (StringUtils.isNotEmpty(batchId)) {
String topicName = "test-product1";
try {
populateProductsOfferPriceUsingDFStruct(batchId, topicName);
return KafkaPublisherResponse.builder()
.topicName(topicName)
.batchId(batchId)
.status("File data pushed to Kafka topic").build();
} catch (TimeoutException | StreamingQueryException e) {
log.error("{} while publishing batch {} to Kafka: {}", e.getClass().getSimpleName(), batchId, e.getMessage());
return KafkaPublisherResponse.builder().topicName(topicName).status(e.getMessage()).build();
}
} else {
return KafkaPublisherResponse.builder().build();
}
}
public void populateProductsOfferPriceUsingDF() {
sparkSession.sqlContext().udf()
.register("calculateOfferPrice", (Double price, Double discountPercentage) ->
((100 - discountPercentage) * price) / 100, DataTypes.DoubleType);
Dataset<Row> df = sparkSession.read().csv("src/main/resources/data/products_without_header.csv");
df.withColumn("discount(%)",functions.lit(10.50))
.withColumn("OfferPrice",functions.callUDF("calculateOfferPrice",
functions.col("_c3").cast(DataTypes.DoubleType)
,functions.col("discount(%)").cast(DataTypes.DoubleType)))
.write()
.mode(SaveMode.Overwrite)
.csv("src/main/resources/result/df/product_offer_price");
}
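/**
 * Streams the uploaded CSV batch to Kafka with Spark Structured Streaming.
 * The "calculateOfferPrice" UDF computes offerPrice = price * (100 - discountPercentage) / 100;
 * e.g. price 100.00 at a 10.50% discount yields 89.50. The broker address
 * used below is specific to this POC environment.
 */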
public void populateProductsOfferPriceUsingDFStruct(String batchId,String topicName) throws TimeoutException, StreamingQueryException {
sparkSession.sqlContext().udf()
.register("calculateOfferPrice",(Double price, Double discountPercentage)->
((100-discountPercentage)*price)/100, DataTypes.DoubleType);
StructField[] structFields = new StructField[4];
structFields[0] = new StructField("id", DataTypes.StringType, true, Metadata.empty());
structFields[1] = new StructField("name", DataTypes.StringType, true, Metadata.empty());
structFields[2] = new StructField("description", DataTypes.StringType, true, Metadata.empty());
structFields[3] = new StructField("price", DataTypes.DoubleType, true, Metadata.empty());
StructType structType = new StructType(structFields);
Dataset<Row> df=sparkSession.readStream()
.format("csv")
.option("header",true)
.schema(structType)
.load("src/main/resources/data/"+batchId);
Dataset<Row> rowTransformDf=df.withColumn("discountPercentage",functions.lit(10.50))
.withColumn("offerPrice",functions.callUDF("calculateOfferPrice",
functions.col("price").cast(DataTypes.DoubleType)
,functions.col("discountPercentage").cast(DataTypes.DoubleType)))
.withColumn("uniqueId", functions.lit(UUID.randomUUID().toString()))
.withColumn("batchId",functions.lit(batchId));
rowTransformDf.selectExpr("CAST(batchId AS STRING) AS key", "to_json(struct(*)) AS value")
.writeStream()
.format("kafka")
.trigger(Trigger.Once())
.option("kafka.bootstrap.servers", "HYD-LAP-00484.corp.nisum.com:9092")
.option("topic", topicName)
.option("checkpointLocation", "src/main/resources/data/CheckPointLocation")
.start()
.awaitTermination(5000);
}
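/**
 * Replays the Kafka topic, keeps rows for the given batchId whose name contains
 * "Apple" and whose offerPrice is at least 20000, and writes the matches back to
 * the batch directory as CSV. The query result is not materialized into the
 * response yet, so the endpoint currently returns an empty Flux.
 */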
public Flux<ProductOffers> getProductOffers(String batchId) throws TimeoutException, StreamingQueryException {
Dataset<Row> df = sparkSession
.readStream()
.format("kafka")
.option("kafka.bootstrap.servers", "HYD-LAP-00484.corp.nisum.com:9092")
.option("subscribe", "test-product1")
.option("startingOffsets", "earliest")
.option("startingOffsets", "earliest")
.option("failOnDataLoss", "false").
option("kafka.con.group.id", "localStream")
.load();
StructType productOffers = Encoders.bean(ProductOffers.class).schema();
Dataset<ProductOffers> offersDataset = df.selectExpr("CAST(value AS STRING) AS message")
.select(functions.from_json(functions.col("message"), productOffers).as("data"))
.filter(functions.col("data.batchId").equalTo(batchId))
.filter(functions.col("data.name").contains("Apple"))
.filter(functions.col("data.offerPrice").geq(20000))
.select("data.*")
.coalesce(1)
.as(Encoders.bean(ProductOffers.class));
StreamingQuery streamingQuery=offersDataset
.writeStream()
.trigger(Trigger.Once())
.format("csv")
.option("checkpointLocation", "src/main/resources/data/checkpointLocation/StreamingJob")
.option("path","src/main/resources/data/"+batchId)
.outputMode(OutputMode.Append())
.start();
streamingQuery.awaitTermination();
return Flux.empty();
}
}
package com.nisum.product.utils;
import org.apache.spark.sql.SparkSession;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
@Configuration
public class SparkConfiguration {
@Bean
public SparkSession sparkSession() {
// local[1] runs Spark in-process on a single thread, which is enough for this POC
return SparkSession
.builder()
.appName("Product info API")
.master("local[1]")
.getOrCreate();
}
}
# 'spring.http.multipart.*' was renamed to 'spring.servlet.multipart.*' in Spring Boot 2.x
spring.servlet.multipart.max-file-size=2GB
spring.servlet.multipart.max-request-size=2GB
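# This app runs on the WebFlux stack, where the servlet multipart limits above do
# not apply. A possible reactive-stack sketch (assumes Spring Boot 2.6+ with the
# default multipart reader; property names would need verifying for this version):
# spring.webflux.multipart.max-in-memory-size=1MB
# spring.webflux.multipart.max-disk-usage-per-part=2GB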