import mermaid from 'https://cdnjs.cloudflare.com/ajax/libs/mermaid/10.2.3/mermaid.esm.min.mjs'; mermaid.initialize({ startOnLoad: true });
from jyquickhelper import add_notebook_menu
add_notebook_menu()
from pandas import DataFrame
df = DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))
df
| | X | Y |
|---|---|---|
| 0 | 4.5 | a |
| 1 | 6.0 | b |
| 2 | 7.0 | c |
We create a streaming dataframe:
from pandas_streaming.df import StreamingDataFrame
sdf = StreamingDataFrame.read_df(df)
sdf
<pandas_streaming.df.dataframe.StreamingDataFrame at 0x15c2c606160>
sdf.to_dataframe()
| | X | Y |
|---|---|---|
| 0 | 4.5 | a |
| 1 | 6.0 | b |
| 2 | 7.0 | c |
Internally, StreamingDataFrame implements an iterator over dataframes and tries to replicate the interface of pandas.DataFrame wherever it is possible to manipulate the data without loading everything into memory.
sdf2 = sdf.concat(sdf)
sdf2.to_dataframe()
| | X | Y |
|---|---|---|
| 0 | 4.5 | a |
| 1 | 6.0 | b |
| 2 | 7.0 | c |
| 0 | 4.5 | a |
| 1 | 6.0 | b |
| 2 | 7.0 | c |
m = DataFrame(dict(Y=["a", "b"], Z=[10, 20]))
m
| | Y | Z |
|---|---|---|
| 0 | a | 10 |
| 1 | b | 20 |
sdf3 = sdf2.merge(m, left_on="Y", right_on="Y", how="outer")
sdf3.to_dataframe()
| | X | Y | Z |
|---|---|---|---|
| 0 | 4.5 | a | 10.0 |
| 1 | 6.0 | b | 20.0 |
| 2 | 7.0 | c | NaN |
| 0 | 4.5 | a | 10.0 |
| 1 | 6.0 | b | 20.0 |
| 2 | 7.0 | c | NaN |
sdf2.to_dataframe().merge(m, left_on="Y", right_on="Y", how="outer")
| | X | Y | Z |
|---|---|---|---|
| 0 | 4.5 | a | 10.0 |
| 1 | 4.5 | a | 10.0 |
| 2 | 6.0 | b | 20.0 |
| 3 | 6.0 | b | 20.0 |
| 4 | 7.0 | c | NaN |
| 5 | 7.0 | c | NaN |
The streaming merge and the pandas merge return the same rows, but their order (and the resulting index) might be different.
sdftr, sdfte = sdf2.train_test_split(test_size=0.5)
sdfte.head()
| | X | Y |
|---|---|---|
| 0 | 4.5 | a |
| 1 | 4.5 | a |
sdftr.head()
| | X | Y |
|---|---|---|
| 0 | 6.0 | b |
| 1 | 7.0 | c |
| 2 | 6.0 | b |
| 0 | 7.0 | c |
sdf2.to_csv("example.txt")
'example.txt'
new_sdf = StreamingDataFrame.read_csv("example.txt")
new_sdf.train_test_split("example.{}.txt", streaming=False)
['example.train.txt', 'example.test.txt']
import glob
glob.glob("ex*.txt")
['example.test.txt', 'example.train.txt', 'example.txt']