DataFrame:通过SparkSql将scala类转为DataFrame的方法


Posted in Python on January 29, 2019

如下所示:

import java.text.DecimalFormat
import com.alibaba.fastjson.JSON
import com.donews.data.AppConfig
import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{Row, SaveMode, DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.LoggerFactory
 
/**
 * Created by silentwolf on 2016/6/3.
 */
 
/**
 * One record of per-user tag scores, identified by `SUUID`.
 *
 * Field names are upper-case because they are used verbatim as column names
 * once instances are turned into a DataFrame with `toDF()` and queried by SQL.
 * Field order matters for positional construction and must not be changed.
 *
 * The grouping comments below are inferred from the field names only —
 * confirm the exact meaning of each score with the data owner.
 */
case class UserTag(
  SUUID: String,

  // Gender-related scores.
  MAN: Float, WOMAN: Float,

  // Age-bucket scores.
  AGE10_19: Float, AGE20_29: Float, AGE30_39: Float, AGE40_49: Float, AGE50_59: Float,

  // Interest / category scores.
  GAME: Float, MOVIE: Float, MUSIC: Float, ART: Float, POLITICS_NEWS: Float,
  FINANCIAL: Float, EDUCATION_TRAINING: Float, HEALTH_CARE: Float, TRAVEL: Float, AUTOMOBILE: Float,
  HOUSE_PROPERTY: Float, CLOTHING_ACCESSORIES: Float, BEAUTY: Float, IT: Float, BABY_PRODUCT: Float,
  FOOD_SERVICE: Float, HOME_FURNISHING: Float, SPORTS: Float, OUTDOOR_ACTIVITIES: Float, MEDICINE: Float
)
 
/**
 * Spark job that builds one [[UserTag]] row per distinct `suuid` found in the
 * `appstatistic` parquet data and writes the result to the Phoenix table
 * `USER_TAGS`.
 *
 * Pipeline, as implemented below:
 *  1. load the parquet rows for one appkey and day range ([[loadDataFrame]]);
 *  2. call the `taginfo` UDF once per distinct non-null `suuid`; it returns a
 *     JSON string of scores ([[userTagInfo]]);
 *  3. parse that JSON into [[UserTag]] instances and convert them to a DataFrame;
 *  4. prepend the `APPKEY` and `DATE` columns and save through the
 *     `org.apache.phoenix.spark` data source.
 */
object UserTagTable {

  /** Logger for this job (bound to this object rather than to an unrelated class). */
  val LOG: org.slf4j.Logger = LoggerFactory.getLogger(UserTagTable.getClass)

  /** Root directory of the report data, built from the project's `AppConfig` settings. */
  val REP_HOME = s"${AppConfig.HDFS_MASTER}/${AppConfig.HDFS_REP}"

  /** Lower bound of the `day` filter; the job only takes the upper bound from the command line. */
  private val DefaultStartDay = "2016-04-10"

  /** Names of the 27 score entries, in the order they appear in the JSON and in [[UserTag]]. */
  private val AgeKeys = Vector("age10_19", "age20_29", "age30_39", "age40_49")
  private val InterestKeys = Vector(
    "game", "movie", "music", "art", "politics_news",
    "financial", "education_training", "health_care", "travel", "automobile",
    "house_property", "clothing_accessories", "beauty", "IT", "baby_Product",
    "food_service", "home_furnishing", "sports", "outdoor_activities", "medicine")

  /**
   * Job entry point.
   *
   * @param args `args(0)` is the appkey, `args(1)` is the last day (inclusive)
   *             of the range to process. When fewer than two arguments are
   *             given, a usage line is printed and nothing else happens.
   */
  def main(args: Array[String]): Unit = {
    // Both positional arguments are read below, so require both up front
    // instead of failing later with ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
      println("请输入: appkey , StartTime : 2016-04-10 ,StartEnd :2016-04-11")
    } else {
      val appkey = args(0)
      val lastdate = args(1)

      val conf: com.typesafe.config.Config = ConfigFactory.load()
      val sc = new SparkContext()
      try {
        buildAndSaveTags(new SQLContext(sc), conf.getString("Hbase.url"), appkey, lastdate)
      } finally {
        // Always release the context, including when the job fails.
        sc.stop()
      }
    }
  }

  /** Runs the load -> tag -> save pipeline on an already created SQLContext. */
  private def buildAndSaveTags(sqlContext: SQLContext, zkUrl: String, appkey: String, lastdate: String): Unit = {
    import sqlContext.implicits._

    loadDataFrame(sqlContext, appkey, DefaultStartDay, lastdate).registerTempTable("suuidTable")

    sqlContext.udf.register("taginfo", (a: String) => userTagInfo(a))
    sqlContext.udf.register("intToString", (b: Long) => intToString(b))

    // Key step: pair every suuid from the temp table with the JSON produced by
    // the UDF and turn each pair into a UserTag.
    // `group by suuid` already yields one row per suuid, so no extra DISTINCT is
    // needed. Null suuids are filtered here because the typed Row pattern below
    // would otherwise throw a MatchError on them.
    sqlContext
      .sql("select suuid, taginfo(suuid) AS taginfo from suuidTable where suuid is not null group by suuid")
      .map { case Row(suuid: String, taginfo: String) => toUserTag(suuid, taginfo) }
      .toDF()
      .registerTempTable("resultTable")

    val tagColumns = Seq(
      "SUUID", "MAN", "WOMAN", "AGE10_19", "AGE20_29", "AGE30_39", "AGE40_49", "AGE50_59",
      "GAME", "MOVIE", "MUSIC", "ART", "POLITICS_NEWS", "FINANCIAL", "EDUCATION_TRAINING",
      "HEALTH_CARE", "TRAVEL", "AUTOMOBILE", "HOUSE_PROPERTY", "CLOTHING_ACCESSORIES", "BEAUTY",
      "IT", "BABY_PRODUCT", "FOOD_SERVICE", "HOME_FURNISHING", "SPORTS", "OUTDOOR_ACTIVITIES",
      "MEDICINE").mkString(",")

    // NOTE(review): appkey and lastdate are spliced into the SQL text as string
    // literals; a value containing a single quote will break the statement.
    val resultDF = sqlContext.sql(
      s"select '$appkey' AS APPKEY, '$lastdate' AS DATE," + tagColumns +
        " from resultTable WHERE SUUID IS NOT NULL")

    resultDF.write
      .mode(SaveMode.Overwrite)
      .options(Map("table" -> "USER_TAGS", "zkUrl" -> zkUrl))
      .format("org.apache.phoenix.spark")
      .save()
  }

  /** Parses the JSON produced by [[userTagInfo]] into a [[UserTag]] for the given suuid. */
  private def toUserTag(suuid: String, taginfo: String): UserTag = {
    val tags = JSON.parseObject(taginfo)
    UserTag(suuid,
      tags.getFloat("man"),
      tags.getFloat("woman"),
      tags.getFloat("age10_19"),
      tags.getFloat("age20_29"),
      tags.getFloat("age30_39"),
      tags.getFloat("age40_49"),
      tags.getFloat("age50_59"),
      tags.getFloat("game"),
      tags.getFloat("movie"),
      tags.getFloat("music"),
      tags.getFloat("art"),
      tags.getFloat("politics_news"),
      tags.getFloat("financial"),
      tags.getFloat("education_training"),
      tags.getFloat("health_care"),
      tags.getFloat("travel"),
      tags.getFloat("automobile"),
      tags.getFloat("house_property"),
      tags.getFloat("clothing_accessories"),
      tags.getFloat("beauty"),
      tags.getFloat("IT"),
      tags.getFloat("baby_Product"),
      tags.getFloat("food_service"),
      tags.getFloat("home_furnishing"),
      tags.getFloat("sports"),
      tags.getFloat("outdoor_activities"),
      tags.getFloat("medicine"))
  }

  /**
   * Renders a numeric id as its decimal string. Registered as the
   * `intToString` SQL UDF.
   *
   * @param suuid the value to render
   * @return `suuid` as a string
   */
  def intToString(suuid: Long): String = suuid.toString

  /**
   * Produces a JSON object string holding 27 randomly generated scores, each
   * rounded to two decimals. Registered as the `taginfo` SQL UDF.
   *
   * `man` and `woman` are complementary (`woman = 1 - man`), and `age50_59` is
   * whatever remains of 1 after the four other age buckets, each of which is
   * drawn from `[0, 0.2)`. All remaining scores are drawn from `[0, 1)`.
   *
   * @param num1 ignored; the output does not depend on it
   * @return a JSON object string with keys `man`, `woman`, `age10_19` ... `medicine`
   */
  def userTagInfo(num1: String): String = {
    // Pin the symbols to Locale.ROOT: with the JVM default locale the decimal
    // separator may be a comma, which makes `.toFloat` throw NumberFormatException.
    val twoDecimals = new DecimalFormat("0.00", new java.text.DecimalFormatSymbols(java.util.Locale.ROOT))

    def rounded(value: Double): Float = twoDecimals.format(value).toFloat

    val man = rounded(math.random)
    val woman = rounded(1 - man)

    val ageScores = AgeKeys.map(key => key -> rounded(math.random * 0.2))
    // Same left-to-right Float subtraction as `1 - a - b - c - d`.
    val age50_59 = rounded(ageScores.foldLeft(1f) { case (rest, (_, score)) => rest - score })

    val interestScores = InterestKeys.map(key => key -> rounded(math.random * 1))

    val entries =
      (Vector("man" -> man, "woman" -> woman) ++ ageScores :+ ("age50_59" -> age50_59)) ++ interestScores

    entries
      .map { case (key, score) => "\"" + key + "\"" + ":" + score }
      .mkString("{", ",", "}")
  }

  /**
   * Loads the `appstatistic` parquet data under [[REP_HOME]] and keeps the rows
   * of one appkey within an inclusive day range that have a non-null timestamp.
   *
   * @param ctx      SQL context used to read the parquet files
   * @param appkey   value the `appkey` column must equal
   * @param startDay inclusive lower bound for the `day` column
   * @param endDay   inclusive upper bound for the `day` column
   * @return the filtered DataFrame
   */
  def loadDataFrame(ctx: SQLContext, appkey: String, startDay: String, endDay: String): DataFrame = {
    val path = s"$REP_HOME/appstatistic"
    val condition = s"timestamp is not null and appkey='$appkey' and day>='$startDay' and day<='$endDay'"
    ctx.read.parquet(path).filter(condition)
  }

}

以上这篇DataFrame:通过SparkSql将scala类转为DataFrame的方法就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持三水点靠木。

Python 相关文章推荐
Python文件处理
Feb 29 Python
浅谈python对象数据的读写权限
Sep 12 Python
Flask框架的学习指南之制作简单blog系统
Nov 20 Python
django基础之数据库操作方法(详解)
May 24 Python
python利用Tesseract识别验证码的方法示例
Jan 21 Python
Django中提示消息messages的设置方式
Nov 15 Python
Python爬虫解析网页的4种方式实例及原理解析
Dec 30 Python
python实现图像拼接
Mar 05 Python
Python 面向对象部分知识点小结
Mar 09 Python
基于python实现matlab filter函数过程详解
Jun 08 Python
如何利用python读取micaps文件详解
Oct 18 Python
详解Python中openpyxl模块基本用法
Feb 23 Python
pandas去除重复列的实现方法
Jan 29 #Python
使用Python向C语言的链接库传递数组、结构体、指针类型的数据
Jan 29 #Python
pandas去重复行并分类汇总的实现方法
Jan 29 #Python
spark dataframe 将一列展开,把该列所有值都变成新列的方法
Jan 29 #Python
Python使用ctypes调用C/C++的方法
Jan 29 #Python
dataframe 按条件替换某一列中的值方法
Jan 29 #Python
Numpy之random函数使用学习
Jan 29 #Python
You might like
PHP中利用substr_replace将指定两位置之间的字符替换为*号
2011/01/27 PHP
php堆排序(heapsort)练习
2013/11/13 PHP
PHP动态编译出现Cannot find autoconf的解决方法
2014/11/05 PHP
Mac OS下配置PHP+MySql环境
2015/02/25 PHP
关于跨站脚本攻击问题
2011/12/22 Javascript
js判断样式className同时增加class或删除class
2013/01/30 Javascript
js如何获取object类型里的键值
2014/02/18 Javascript
调用DOM对象的focus使文本框获得焦点
2014/02/19 Javascript
js设置文本框中焦点位置在最后的示例代码(简单实用)
2014/03/04 Javascript
Node.js连接postgreSQL并进行数据操作
2016/12/18 Javascript
jQuery模拟淘宝购物车功能
2017/02/27 Javascript
javascript浅层克隆、深度克隆对比及实例解析
2020/02/09 Javascript
jQuery实现简单评论功能
2020/08/19 jQuery
three.js显示中文字体与tween应用详析
2021/01/04 Javascript
[05:08]第一届“网鱼杯”DOTA2比赛精彩集锦
2014/09/05 DOTA
[00:56]跨越时空加入战场 全新祈求者身心“失落奇艺侍祭”展示
2019/07/20 DOTA
[01:07:11]Secret vs Newbee 2019国际邀请赛小组赛 BO2 第二场 8.15
2019/08/17 DOTA
Python获取脚本所在目录的正确方法
2014/04/15 Python
Python判断变量是否已经定义的方法
2014/08/18 Python
Flask框架web开发之零基础入门
2018/12/10 Python
详解js文件通过python访问数据库方法
2019/03/03 Python
wxPython:python首选的GUI库实例分享
2019/10/05 Python
python 下载文件的几种方法汇总
2021/01/06 Python
澳大利亚家具和家居用品在线商店:Interiors Online
2018/03/05 全球购物
Vita Fede官网:在意大利手工制作,在纽约市设计
2019/10/25 全球购物
创建服务型党组织实施方案
2014/02/25 职场文书
我的祖国演讲稿
2014/05/04 职场文书
访谈节目策划方案
2014/05/15 职场文书
奥林匹克的口号
2014/06/13 职场文书
四风个人对照检查材料思想汇报
2014/09/25 职场文书
2014年电工工作总结
2014/11/20 职场文书
幼儿园个人总结
2015/02/28 职场文书
代理词怎么写
2015/05/25 职场文书
史上最全的军训拉歌口号
2015/12/25 职场文书
爱心捐款倡议书:点燃希望,传递温暖
2019/11/04 职场文书
MySQL数据库完全卸载的方法
2022/03/03 MySQL