Commit 3b383b3e authored by Laxman Methuku

Spark Mongo Connector

HELP.md
target/
!.mvn/wrapper/maven-wrapper.jar
!**/src/main/**/target/
!**/src/test/**/target/
### STS ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache
### IntelliJ IDEA ###
.idea
*.iws
*.iml
*.ipr
### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/
build/
!**/src/main/**/build/
!**/src/test/**/build/
### VS Code ###
.vscode/
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.7/apache-maven-3.8.7-bin.zip
wrapperUrl=https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.1/maven-wrapper-3.1.1.jar
@REM ----------------------------------------------------------------------------
@REM Licensed to the Apache Software Foundation (ASF) under one
@REM or more contributor license agreements. See the NOTICE file
@REM distributed with this work for additional information
@REM regarding copyright ownership. The ASF licenses this file
@REM to you under the Apache License, Version 2.0 (the
@REM "License"); you may not use this file except in compliance
@REM with the License. You may obtain a copy of the License at
@REM
@REM https://www.apache.org/licenses/LICENSE-2.0
@REM
@REM Unless required by applicable law or agreed to in writing,
@REM software distributed under the License is distributed on an
@REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@REM KIND, either express or implied. See the License for the
@REM specific language governing permissions and limitations
@REM under the License.
@REM ----------------------------------------------------------------------------
@REM ----------------------------------------------------------------------------
@REM Maven Start Up Batch script
@REM
@REM Required ENV vars:
@REM JAVA_HOME - location of a JDK home dir
@REM
@REM Optional ENV vars
@REM M2_HOME - location of maven2's installed home dir
@REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands
@REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a keystroke before ending
@REM MAVEN_OPTS - parameters passed to the Java VM when running Maven
@REM e.g. to debug Maven itself, use
@REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
@REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files
@REM ----------------------------------------------------------------------------
@REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on'
@echo off
@REM set title of command window
title %0
@REM enable echoing by setting MAVEN_BATCH_ECHO to 'on'
@if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO%
@REM set %HOME% to equivalent of $HOME
if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%")
@REM Execute a user defined script before this one
if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre
@REM check for pre script, once with legacy .bat ending and once with .cmd ending
if exist "%USERPROFILE%\mavenrc_pre.bat" call "%USERPROFILE%\mavenrc_pre.bat" %*
if exist "%USERPROFILE%\mavenrc_pre.cmd" call "%USERPROFILE%\mavenrc_pre.cmd" %*
:skipRcPre
@setlocal
set ERROR_CODE=0
@REM To isolate internal variables from possible post scripts, we use another setlocal
@setlocal
@REM ==== START VALIDATION ====
if not "%JAVA_HOME%" == "" goto OkJHome
echo.
echo Error: JAVA_HOME not found in your environment. >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error
:OkJHome
if exist "%JAVA_HOME%\bin\java.exe" goto init
echo.
echo Error: JAVA_HOME is set to an invalid directory. >&2
echo JAVA_HOME = "%JAVA_HOME%" >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error
@REM ==== END VALIDATION ====
:init
@REM Find the project base dir, i.e. the directory that contains the folder ".mvn".
@REM Fallback to current working directory if not found.
set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR%
IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir
set EXEC_DIR=%CD%
set WDIR=%EXEC_DIR%
:findBaseDir
IF EXIST "%WDIR%"\.mvn goto baseDirFound
cd ..
IF "%WDIR%"=="%CD%" goto baseDirNotFound
set WDIR=%CD%
goto findBaseDir
:baseDirFound
set MAVEN_PROJECTBASEDIR=%WDIR%
cd "%EXEC_DIR%"
goto endDetectBaseDir
:baseDirNotFound
set MAVEN_PROJECTBASEDIR=%EXEC_DIR%
cd "%EXEC_DIR%"
:endDetectBaseDir
IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig
@setlocal EnableExtensions EnableDelayedExpansion
for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! %%a
@endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS%
:endReadAdditionalConfig
SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe"
set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar"
set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain
set DOWNLOAD_URL="https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar"
FOR /F "usebackq tokens=1,2 delims==" %%A IN ("%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties") DO (
IF "%%A"=="wrapperUrl" SET DOWNLOAD_URL=%%B
)
@REM Extension to allow automatically downloading the maven-wrapper.jar from Maven-central
@REM This allows using the maven wrapper in projects that prohibit checking in binary data.
if exist %WRAPPER_JAR% (
if "%MVNW_VERBOSE%" == "true" (
echo Found %WRAPPER_JAR%
)
) else (
if not "%MVNW_REPOURL%" == "" (
SET DOWNLOAD_URL="%MVNW_REPOURL%/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar"
)
if "%MVNW_VERBOSE%" == "true" (
echo Couldn't find %WRAPPER_JAR%, downloading it ...
echo Downloading from: %DOWNLOAD_URL%
)
powershell -Command "&{"^
"$webclient = new-object System.Net.WebClient;"^
"if (-not ([string]::IsNullOrEmpty('%MVNW_USERNAME%') -and [string]::IsNullOrEmpty('%MVNW_PASSWORD%'))) {"^
"$webclient.Credentials = new-object System.Net.NetworkCredential('%MVNW_USERNAME%', '%MVNW_PASSWORD%');"^
"}"^
"[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; $webclient.DownloadFile('%DOWNLOAD_URL%', '%WRAPPER_JAR%')"^
"}"
if "%MVNW_VERBOSE%" == "true" (
echo Finished downloading %WRAPPER_JAR%
)
)
@REM End of extension
@REM Provide a "standardized" way to retrieve the CLI args that will
@REM work with both Windows and non-Windows executions.
set MAVEN_CMD_LINE_ARGS=%*
%MAVEN_JAVA_EXE% ^
%JVM_CONFIG_MAVEN_PROPS% ^
%MAVEN_OPTS% ^
%MAVEN_DEBUG_OPTS% ^
-classpath %WRAPPER_JAR% ^
"-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" ^
%WRAPPER_LAUNCHER% %MAVEN_CONFIG% %*
if ERRORLEVEL 1 goto error
goto end
:error
set ERROR_CODE=1
:end
@endlocal & set ERROR_CODE=%ERROR_CODE%
if not "%MAVEN_SKIP_RC%"=="" goto skipRcPost
@REM check for post script, once with legacy .bat ending and once with .cmd ending
if exist "%USERPROFILE%\mavenrc_post.bat" call "%USERPROFILE%\mavenrc_post.bat"
if exist "%USERPROFILE%\mavenrc_post.cmd" call "%USERPROFILE%\mavenrc_post.cmd"
:skipRcPost
@REM pause the script if MAVEN_BATCH_PAUSE is set to 'on'
if "%MAVEN_BATCH_PAUSE%"=="on" pause
if "%MAVEN_TERMINATE_CMD%"=="on" exit %ERROR_CODE%
cmd /C exit /B %ERROR_CODE%
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.7.12</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>spark.mongo.kafka</groupId>
    <artifactId>spark-mongo-connector</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>spark-mongo-connector</name>
    <description>Demo project for Spark Mongo Connector</description>
    <properties>
        <java.version>1.8</java.version>
        <spark.version>3.1.2</spark.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-mongodb</artifactId>
        </dependency>
        <dependency>
            <groupId>org.mongodb.spark</groupId>
            <artifactId>mongo-spark-connector_2.12</artifactId>
            <version>3.0.1</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-streams</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql-kafka-0-10_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>2.8.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <scope>runtime</scope>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>
    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>org.codehaus.janino</groupId>
                <artifactId>commons-compiler</artifactId>
                <version>3.0.8</version>
            </dependency>
            <dependency>
                <groupId>org.codehaus.janino</groupId>
                <artifactId>janino</artifactId>
                <version>3.0.8</version>
            </dependency>
        </dependencies>
    </dependencyManagement>
    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <excludes>
                        <exclude>
                            <groupId>org.projectlombok</groupId>
                            <artifactId>lombok</artifactId>
                        </exclude>
                    </excludes>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
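The commit does not include an application.properties, yet spring-boot-starter-data-mongodb makes Spring Boot attempt a MongoDB connection at startup (defaulting to localhost:27017). A minimal sketch of such a file, assuming the local usersdb database used by the services below (this file and its values are illustrative, not part of the commit):

# src/main/resources/application.properties (hypothetical)
spring.data.mongodb.uri=mongodb://localhost:27017/usersdb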
package com.nisum.hyd;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
@SpringBootApplication
public class SparkMongoConnectorApplication {

    public static void main(String[] args) {
        SpringApplication.run(SparkMongoConnectorApplication.class, args);
    }
}
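With the Maven wrapper files above checked in, the application can be started from the project root in the usual Spring Boot way, e.g. ./mvnw spring-boot:run (or mvnw.cmd spring-boot:run on Windows).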
package com.nisum.hyd.controller;
import com.nisum.hyd.service.SparkKafkaConsumer;
import com.nisum.hyd.service.SparkKafkaProducer;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
@RestController
@RequestMapping("/user")
public class UserController {

    @Autowired
    SparkKafkaProducer sparkKafkaProducer;

    @Autowired
    SparkKafkaConsumer sparkKafkaConsumer;

    @GetMapping("/getUserAndCardDataDetailsPushToKafka")
    public String getUserAndCardDataDetailsPushToKafka() {
        sparkKafkaProducer.getUserAndCardDataDetailsPushToKafka();
        return "User and Card data retrieved successfully";
    }

    @GetMapping("/getUserDetailsAndSaveToDB")
    public String getUserDetailsAndSaveToDB() {
        return sparkKafkaConsumer.getUserDetailsAndSaveToDB();
    }
}
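With the application running on the default port 8080, the two flows are triggered in order via the mappings above (produce first, then consume):

GET http://localhost:8080/user/getUserAndCardDataDetailsPushToKafka
GET http://localhost:8080/user/getUserDetailsAndSaveToDB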
package com.nisum.hyd.dto;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
@AllArgsConstructor
public class UserCardDTO {

    private String userId;
    private String userName;
    private String regMobileNo;
    private String gender;
    private String cardType;
    private String cardNumber;
    private String cardStatus;
}
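For reference, this DTO is both the shape of the JSON value that SparkKafkaProducer publishes to Kafka and the bean schema SparkKafkaConsumer uses to parse it. A hypothetical message value (all field values are illustrative):

{"userId":"U100","userName":"Asha","regMobileNo":"9999900000","gender":"F","cardType":"Visa","cardNumber":"4111111111111111","cardStatus":"Active"}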
package com.nisum.hyd.entity;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.springframework.data.mongodb.core.mapping.Document;
@Data
@NoArgsConstructor
@AllArgsConstructor
@Document("usercardinfo")
public class UserCardInfo {

    private String cardId;
    private String userId;
    private String cardType;
    private String cardNumber;
    private String cardStatus;
}
package com.nisum.hyd.entity;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.springframework.data.mongodb.core.mapping.Document;
@Data
@NoArgsConstructor
@AllArgsConstructor
@Document("userinfo")
public class UserInfo {

    private String userId;
    private String userName;
    private String mobileNo;
    private String gender;
}
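The producer joins these two collections from the usersdb database on userId. Hypothetical seed documents consistent with the entity fields (values are illustrative):

// usersdb.userinfo
{ "userId": "U100", "userName": "Asha", "mobileNo": "9999900000", "gender": "F" }
// usersdb.usercardinfo
{ "cardId": "C200", "userId": "U100", "cardType": "Visa", "cardNumber": "4111111111111111", "cardStatus": "Active" }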
package com.nisum.hyd.service;
import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoClients;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.nisum.hyd.dto.UserCardDTO;
import org.apache.log4j.Logger;
import org.apache.spark.sql.*;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;
import org.apache.spark.sql.types.StructType;
import org.bson.Document;
import org.springframework.stereotype.Service;
import java.util.concurrent.TimeoutException;
@Service
public class SparkKafkaConsumer {

    private static final Logger LOGGER = Logger.getLogger(SparkKafkaConsumer.class);
    public static final String MONGODB_DOCUMENT_COLLECTION = "mongodb://127.0.0.1/userdetailsdb.userdetails";
    public static final String MONGODB_URI = "mongodb://localhost:27017";
    private static final String KafkaBrokerEndpoint = "localhost:9092";
    private static final String KafkaTopic = "UserWithCardDetails";

    public String getUserDetailsAndSaveToDB() {
        LOGGER.info("getUserDetailsAndSaveToDB() called");
        SparkSession sparkSession = SparkSession.builder()
                .master("local[*]")
                .appName("MongoSparkConnector")
                .config("spark.mongodb.input.uri", MONGODB_DOCUMENT_COLLECTION)
                .config("spark.mongodb.output.uri", MONGODB_DOCUMENT_COLLECTION)
                .getOrCreate();
        Dataset<UserCardDTO> userDataset = getFilteredData(sparkSession);
        try {
            saveToDB(userDataset);
        } catch (Exception e) {
            LOGGER.error("Failed to write into database in getUserDetailsAndSaveToDB()", e);
        }
        // Saving the Dataset into MongoDB without streaming (requires com.mongodb.spark.MongoSpark):
        /*try {
            MongoSpark.save(userDataset);
        } catch (Exception e) {
            LOGGER.error("Failed to write into database in getUserDetailsAndSaveToDB()", e);
        }*/
        return "Data Saved in DB";
    }
    Dataset<UserCardDTO> getFilteredData(SparkSession sparkSession) {
        LOGGER.info("getFilteredData() called");
        Dataset<Row> ds = sparkSession.readStream().format("kafka")
                .option("kafka.bootstrap.servers", KafkaBrokerEndpoint)
                .option("subscribe", KafkaTopic)
                .option("startingOffsets", "earliest")
                .load();
        StructType structType = Encoders.bean(UserCardDTO.class).schema();
        Dataset<Row> rowDataset = ds
                .selectExpr("CAST(value AS STRING) as message")
                .select(functions.from_json(functions.col("message"), structType).as("userData"))
                .select("userData.*").distinct();
        // Note: show() is not supported on streaming Datasets; the rows are only
        // materialized once the foreach sink in saveToDB() starts the query.
        Dataset<Row> filteredData = rowDataset.filter(functions.col("cardType").equalTo("Visa"));
        return filteredData.as(Encoders.bean(UserCardDTO.class));
    }
    void saveToDB(Dataset<UserCardDTO> userDataset) throws TimeoutException, StreamingQueryException {
        LOGGER.info("saveToDB() called");
        StreamingQuery query = userDataset.writeStream().outputMode("append")
                .option("checkpointLocation", "checkPointLocation/streamingJob")
                .foreach(new ForeachWriter<UserCardDTO>() {
                    private MongoClient mongoClient;
                    private MongoDatabase database;
                    private MongoCollection<Document> collection;

                    @Override
                    public boolean open(long partitionId, long epochId) {
                        mongoClient = MongoClients.create(MONGODB_URI);
                        database = mongoClient.getDatabase("userdetailsdb");
                        collection = database.getCollection("userdetails");
                        return true;
                    }

                    @Override
                    public void process(UserCardDTO userCardDTO) {
                        Document document = new Document();
                        document.append("userId", userCardDTO.getUserId());
                        document.append("userName", userCardDTO.getUserName());
                        document.append("regMobileNo", userCardDTO.getRegMobileNo());
                        document.append("gender", userCardDTO.getGender());
                        document.append("cardType", userCardDTO.getCardType());
                        document.append("cardNumber", userCardDTO.getCardNumber());
                        document.append("cardStatus", userCardDTO.getCardStatus());
                        collection.insertOne(document);
                    }

                    @Override
                    public void close(Throwable errorOrNull) {
                        mongoClient.close();
                    }
                }).start();
        query.awaitTermination();
    }
}
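Note that query.awaitTermination() blocks the calling HTTP request thread indefinitely, so /getUserDetailsAndSaveToDB never returns while the stream runs. A minimal alternative sketch, assuming a bounded wait is acceptable for this demo (the 60-second timeout is an illustrative value, not part of the commit):

// Wait up to 60s, then hand control back to the caller; the streaming query
// keeps running in the background until query.stop() is invoked.
boolean terminated = query.awaitTermination(60_000);
if (!terminated) {
    LOGGER.info("Streaming query still active after 60s; leaving it running");
}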
package com.nisum.hyd.service;
import org.apache.log4j.Logger;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.apache.spark.storage.StorageLevel;
import org.springframework.stereotype.Service;
@Service
public class SparkKafkaProducer {

    private static final Logger LOGGER = Logger.getLogger(SparkKafkaProducer.class);
    public static final String MONGODB_DOCUMENT_COLLECTION = "mongodb://127.0.0.1/usersdb";
    private static final String KafkaBrokerEndpoint = "localhost:9092";
    private static final String MONGODB_DATASOURCE = "com.mongodb.spark.sql.DefaultSource";
    private static final String KafkaTopic = "UserWithCardDetails";

    public String getUserAndCardDataDetailsPushToKafka() {
        LOGGER.info("getUserAndCardDataDetailsPushToKafka() called");
        SparkSession sparkSession = SparkSession.builder()
                .master("local[*]")
                .appName("MongoSparkConnector")
                .config("spark.mongodb.input.uri", MONGODB_DOCUMENT_COLLECTION)
                .config("spark.mongodb.output.uri", MONGODB_DOCUMENT_COLLECTION)
                .getOrCreate();
        Dataset<Row> usersInfo = sparkSession.read().format(MONGODB_DATASOURCE)
                .option("database", "usersdb")
                .option("collection", "userinfo").load();
        usersInfo.show();
        Dataset<Row> userCardInfo = sparkSession.read().format(MONGODB_DATASOURCE)
                .option("database", "usersdb")
                .option("collection", "usercardinfo").load();
        userCardInfo.show();
        // Join the two collections on their shared userId column; joining by column
        // name (rather than a column expression) keeps a single userId column in the
        // result, avoiding an ambiguous reference when it is selected in sendToKafka().
        Dataset<Row> userAndCardDetails = usersInfo.join(userCardInfo, "userId");
        userAndCardDetails.show();
        sendToKafka(userAndCardDetails);
        return "Data Pushed to Kafka Successfully";
    }
    public void sendToKafka(Dataset<Row> userCardDetails) {
        Dataset<Row> kafkaPublish = userCardDetails
                .withColumn("value", functions.to_json(functions.struct(
                        functions.col("userId"),
                        functions.col("userName"),
                        // userinfo stores the number as mobileNo; alias it to match UserCardDTO.regMobileNo
                        functions.col("mobileNo").as("regMobileNo"),
                        functions.col("gender"),
                        functions.col("cardStatus"),
                        functions.col("cardNumber"),
                        functions.col("cardType"))))
                .select("value")
                .persist(StorageLevel.MEMORY_AND_DISK());
        try {
            kafkaPublish.write().format("kafka")
                    .option("kafka.bootstrap.servers", KafkaBrokerEndpoint)
                    .option("topic", KafkaTopic)
                    .save();
        } catch (Exception e) {
            LOGGER.error("Error occurred in sendToKafka() while publishing to Kafka", e);
        } finally {
            kafkaPublish.unpersist();
        }
    }
}
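To verify the publish step, the topic can be inspected with the standard Kafka console consumer (assuming a local broker and the topic name used above):

kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic UserWithCardDetails --from-beginning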
package com.nisum.hyd;
import org.junit.jupiter.api.Test;
import org.springframework.boot.test.context.SpringBootTest;
@SpringBootTest
class SparkMongoConnectorApplicationTests {

    @Test
    void contextLoads() {
    }
}