Traitement de données massives
John Samuel
CPE Lyon
Année: 2024-2025
Courriel: john.samuel@cpe.fr
# Image serving a static copy of johnsamuelwrites.github.io with Apache on port 80.
# Base image is pinned to a release tag so that rebuilds are reproducible.
FROM ubuntu:24.04

# LABEL replaces the deprecated MAINTAINER instruction.
LABEL org.opencontainers.image.authors="John Samuel"

# Install Apache and git in one layer and drop the apt lists in that same layer
# so they do not end up in the image. ca-certificates is listed explicitly
# because --no-install-recommends would otherwise omit it and the HTTPS clone
# below would fail certificate verification.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        apache2 \
        ca-certificates \
        git && \
    rm -rf /var/lib/apt/lists/*

# Replace Apache's default document root with the web site. The clone is shallow
# (only the latest revision is needed) and goes straight to its final location.
RUN rm -rf /var/www/html && \
    git clone --depth 1 https://github.com/johnsamuelwrites/johnsamuelwrites.github.io /var/www/html && \
    echo "ServerName localhost" >> /etc/apache2/apache2.conf

# Port Apache listens on inside the container (published with `docker run -p`).
EXPOSE 80

# Exec form: Apache stays in the foreground and receives the signals sent by
# `docker stop` directly, without an intermediate /bin/sh.
CMD ["apachectl", "-D", "FOREGROUND"]
Construction de l'image docker
docker build -t johnsamuel .
Exécution de l'image docker
docker run -dit -p 8080:80 johnsamuel
Voir le lien: http://localhost:8080/
Reconstruction de l'image docker
docker build --no-cache -t johnsamuel .
$ head /home/john/Downloads/query.csv itemLabel,year Amiga E,1993 Embarcadero Delphi,1995 Sather,1990 Microsoft Small Basic,2008 Squeak,1996 AutoIt,1999 Eiffel,1985 Eiffel,1986 Kent Recursive Calculator,1981
$ export HADOOP_HOME="..."
$ ./hive
hive> set hive.metastore.warehouse.dir=${env:HOME}/hive/warehouse;
$./hive hive> set hive.metastore.warehouse.dir=${env:HOME}/hive/warehouse; hive> create database mydb; hive> use mydb;
$./hive hive> use mydb; hive> CREATE TABLE IF NOT EXISTS proglang (name String, year int) COMMENT "Programming Languages" ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' STORED AS TEXTFILE; hive> LOAD DATA LOCAL INPATH '/home/john/Downloads/query.csv' OVERWRITE INTO TABLE proglang;
$./hive hive> SELECT * from proglang; hive> SELECT * from proglang where year > 1980;
$./hive hive> DELETE from proglang where year=1980; FAILED: SemanticException [Error 10294]: Attempt to do update or delete using transaction manager that does not support these operations.
$./hive hive> set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; hive> DELETE from proglang where year=1980; FAILED: RuntimeException [Error 10264]: To use DbTxnManager you must set hive.support.concurrency=true hive> set hive.support.concurrency=true; hive> DELETE from proglang where year=1980; FAILED: SemanticException [Error 10297]: Attempt to do update or delete on table mydb.proglang that is not transactional hive> ALTER TABLE proglang set TBLPROPERTIES ('transactional'='true') ; FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. Unable to alter table. The table must be stored using an ACID compliant format (such as ORC): mydb.proglang
$./hive hive> use mydb; hive> CREATE TABLE IF NOT EXISTS proglangorc (name String, year int) COMMENT "Programming Languages" ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' STORED AS ORC; hive> LOAD DATA LOCAL INPATH '/home/john/Downloads/query.csv' OVERWRITE INTO TABLE proglangorc; FAILED: SemanticException Unable to load data to destination table. Error: The file that you are trying to load does not match the file format of the destination table.
$./hive hive> insert overwrite table proglangorc select * from proglang; hive> DELETE from proglangorc where year=1980; FAILED: SemanticException [Error 10297]: Attempt to do update or delete on table mydb.proglangorc that is not transactional hive> ALTER TABLE proglangorc set TBLPROPERTIES ('transactional'='true') ; hive> DELETE from proglangorc where year=1980; hive> SELECT count(*) from proglangorc; hive> SELECT count(*) from proglangorc where year=1980;
$./pyspark >>> lines = sc.textFile("/home/john/Downloads/query.csv") >>> lineLengths = lines.map(lambda s: len(s)) >>> totalLength = lineLengths.reduce(lambda a, b: a + b) >>> print(totalLength)
$./pyspark >>> lines = sc.textFile("/home/john/Downloads/query.csv") >>> lineWordCount = lines.map(lambda s: len(s.split())) >>> totalWords = lineWordCount.reduce(lambda a, b: a + b) >>> print(totalWords)
# Configure PySpark so that launching it starts a Jupyter notebook as the driver.
# SPARK_HOME must be the Spark installation root, not its bin/ directory:
# bin/pyspark sources "${SPARK_HOME}/bin/load-spark-env.sh".
export SPARK_HOME='.../spark/spark-x.x.x-bin-hadoopx.x'
# Python interpreter used by PySpark.
export PYSPARK_PYTHON=/usr/bin/python3
# Run the driver through `jupyter notebook` instead of the plain Python shell.
export PYSPARK_DRIVER_PYTHON=jupyter
export PYSPARK_DRIVER_PYTHON_OPTS='notebook'

# Launch PySpark with the settings above.
"$SPARK_HOME"/bin/pyspark
from pyspark.sql import HiveContext

# `sc` is not defined in this cell; it is expected to be the SparkContext
# provided by the running pyspark session.
sqlContext = HiveContext(sc)

# Select the Hive database named "default" and list its tables.
sqlContext.sql("use default")
sqlContext.sql("show tables").show()
+--------+---------+-----------+ |database|tableName|isTemporary| +--------+---------+-----------+ | default| proglang| false| | default|proglang2| false| +--------+---------+-----------+
result = sqlContext.sql("SELECT count(*) FROM proglang ") result.show() +--------+ |count(1)| +--------+ | 611| +--------+
print(type(result))
<class 'pyspark.sql.dataframe.DataFrame'>
import pandas as pd

# Count the rows of proglang; the aggregate is aliased so that the resulting
# column is named "count".
result = sqlContext.sql("SELECT count(*) as count FROM proglang ")

# Convert the Spark DataFrame into a pandas DataFrame and display it.
resultFrame = result.toPandas()
print(resultFrame)
|count| +-----+ | 611|
import pandas as pd

# Load every row of the proglang table and convert it to a pandas DataFrame.
result = sqlContext.sql("SELECT * FROM proglang ")
resultFrame = result.toPandas()

# For each distinct year, count the non-null values of the other columns.
groups = resultFrame.groupby('year').count()
print(groups)
import nltk

# Download the 'vader_lexicon' resource through NLTK's downloader.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

# Score three short sentences; each result printed below is a dict with the
# keys 'neg', 'neu', 'pos' and 'compound'.
sentiment = sia.polarity_scores("this movie is good")
print(sentiment)

sentiment = sia.polarity_scores("this movie is not very good")
print(sentiment)

sentiment = sia.polarity_scores("this movie is bad")
print(sentiment)
{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404} {'neg': 0.344, 'neu': 0.656, 'pos': 0.0, 'compound': -0.3865} {'neg': 0.538, 'neu': 0.462, 'pos': 0.0, 'compound': -0.5423}