lhoestq (HF staff) committed
Commit cc5f104 · verified · 1 Parent(s): 6bf2aa6

Update data/spark.ipynb

Files changed (1):
  1. data/spark.ipynb +4 -66
data/spark.ipynb CHANGED
@@ -1,13 +1,5 @@
 {
  "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "6fb06d81-1778-403c-b15b-d68200a5e6b5",
-   "metadata": {},
-   "source": [
-    "# Spark on Hugging Face"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -21,77 +13,23 @@
     "spark = SparkSession.builder.appName(\"demo\").getOrCreate()"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "8bf07f63-6fed-4cf9-8fee-5f3a5fb6bed1",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "Example:\n",
-    "\n",
-    "```python\n",
-    "# Load the BAAI/Infinity-Instruct dataset\n",
-    "df = read_parquet(\"hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet\")\n",
-    "\n",
-    "# Load only one column\n",
-    "df_langdetect_only = read_parquet(\"hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet\", columns=[\"langdetect\"])\n",
-    "\n",
-    "# Load values within certain ranges\n",
-    "criteria = [(\"langdetect\", \"=\", \"zh-cn\")]\n",
-    "df_chinese_only = read_parquet(\"hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet\", filters=criteria)\n",
-    "\n",
-    "# Save dataset\n",
-    "write_parquet(df_chinese_only, \"hf://datasets/username/Infinity-Instruct-Chinese-Only\")\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ca71b3ac-3291-4e4e-8fee-b3550b0426d6",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from hf_spark_utils import read_parquet, write_parquet, set_session\n",
-    "set_session(spark)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "07ea62a4-7549-4a75-8a12-9d830f6e3cde",
-   "metadata": {},
-   "source": [
-    "#### (Optional) Login"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "343b3a9a-2dce-492b-9384-703368ba3975",
+   "id": "6c0dfe01-9190-454c-9c52-216f74d339e1",
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "from huggingface_hub import notebook_login\n",
-    "notebook_login(new_session=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "332b7609-f0eb-4703-aea6-fec3d09f5870",
-   "metadata": {},
-   "source": [
-    "#### Run your code:"
+    "df = spark.read.format(\"huggingface\").load(repo_id)\n",
+    "df.show(5)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "6c0dfe01-9190-454c-9c52-216f74d339e1",
+   "id": "eec837ab-b3c6-4d4a-bc41-c63855b3af64",
    "metadata": {},
    "outputs": [],
    "source": []