Skip to content

Commit 08f6194

Browse files
committed
Import new random station generation
1 parent 148ef15 commit 08f6194

File tree

4 files changed

+148
-567
lines changed

4 files changed

+148
-567
lines changed

create_measurements3.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,4 @@
1616
#
1717

1818

19-
java --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CreateMeasurements3 $1
19+
java --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CreateMeasurements3 $1 $2

src/main/java/dev/morling/onebrc/CreateMeasurements3.java

Lines changed: 27 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -15,152 +15,57 @@
1515
*/
1616
package dev.morling.onebrc;
1717

18-
import java.io.BufferedReader;
1918
import java.io.BufferedWriter;
20-
import java.io.FileReader;
2119
import java.io.FileWriter;
22-
import java.io.IOException;
23-
import java.io.StringReader;
24-
import java.nio.charset.StandardCharsets;
25-
import java.util.ArrayList;
26-
import java.util.HashSet;
27-
import java.util.concurrent.ThreadLocalRandom;
2820

29-
public class CreateMeasurements3 {
21+
import org.rschwietzke.FastRandom;
3022

31-
public static final int MAX_NAME_LEN = 100;
32-
public static final int KEYSET_SIZE = 10_000;
23+
public class CreateMeasurements3 {
3324

3425
public static void main(String[] args) throws Exception {
35-
if (args.length != 1) {
36-
System.out.println("Usage: create_measurements3.sh <number of records to create>");
26+
if (args.length < 1) {
27+
System.out.println("Usage: create_measurements3.sh <number of records to create> [seed]");
3728
System.exit(1);
3829
}
30+
3931
int size = 0;
4032
try {
4133
size = Integer.parseInt(args[0]);
4234
}
4335
catch (NumberFormatException e) {
4436
System.out.println("Invalid value for <number of records to create>");
45-
System.out.println("Usage: create_measurements3.sh <number of records to create>");
37+
System.out.println("Usage: create_measurements3.sh <number of records to create> [seed]");
4638
System.exit(1);
4739
}
48-
final var weatherStations = generateWeatherStations();
40+
41+
// Default seed is the ASCII bytes of "1brc1brc" packed into a 64-bit value, written in hexadecimal
42+
long seed = 0x3162726331627263L;
43+
if (args.length == 2) {
44+
try {
45+
seed = Long.parseLong(args[1]);
46+
}
47+
catch (NumberFormatException e) {
48+
System.out.println("Invalid value for [seed]");
49+
System.out.println("Usage: create_measurements3.sh <number of records to create> [seed]");
50+
System.exit(1);
51+
}
52+
}
53+
54+
final var weatherStations = WeatherStationFactory.getWeatherStationsList(seed);
4955
final var start = System.currentTimeMillis();
50-
final var rnd = ThreadLocalRandom.current();
56+
final var rnd = new FastRandom(seed);
5157
try (var out = new BufferedWriter(new FileWriter("measurements.txt"))) {
5258
for (int i = 1; i <= size; i++) {
53-
var station = weatherStations.get(rnd.nextInt(KEYSET_SIZE));
54-
double temp = rnd.nextGaussian(station.avgTemp, 7.0);
55-
out.write(station.name);
59+
var station = weatherStations.get(rnd.nextInt(weatherStations.size()));
60+
double temp = station.measurement();
61+
out.write(station.id);
5662
out.write(';');
57-
out.write(Double.toString(Math.round(temp * 10.0) / 10.0));
58-
out.newLine();
63+
out.write(Double.toString(temp));
64+
out.write('\n');
5965
if (i % 50_000_000 == 0) {
6066
System.out.printf("Wrote %,d measurements in %,d ms%n", i, System.currentTimeMillis() - start);
6167
}
6268
}
6369
}
6470
}
65-
66-
record WeatherStation(String name, float avgTemp) {
67-
}
68-
69-
private static ArrayList<WeatherStation> generateWeatherStations() throws Exception {
70-
// Use a public list of city names and concatenate them all into a long string,
71-
// which we'll use as a "source of city name randomness"
72-
var bigName = new StringBuilder(1 << 20);
73-
try (var rows = new BufferedReader(new FileReader("data/weather_stations.csv"));) {
74-
skipComments(rows);
75-
while (true) {
76-
var row = rows.readLine();
77-
if (row == null) {
78-
break;
79-
}
80-
bigName.append(row, 0, row.indexOf(';'));
81-
}
82-
}
83-
final var weatherStations = new ArrayList<WeatherStation>();
84-
final var names = new HashSet<String>();
85-
var minLen = Integer.MAX_VALUE;
86-
var maxLen = Integer.MIN_VALUE;
87-
try (var rows = new BufferedReader(new FileReader("data/weather_stations.csv"))) {
88-
skipComments(rows);
89-
final var nameSource = new StringReader(bigName.toString());
90-
final var buf = new char[MAX_NAME_LEN];
91-
final var rnd = ThreadLocalRandom.current();
92-
final double yOffset = 4;
93-
final double factor = 2500;
94-
final double xOffset = 0.372;
95-
final double power = 7;
96-
for (int i = 0; i < KEYSET_SIZE; i++) {
97-
var row = rows.readLine();
98-
if (row == null) {
99-
break;
100-
}
101-
// Use a 7th-order curve to simulate the name length distribution.
102-
// It gives us mostly short names, but with large outliers.
103-
var nameLen = (int) (yOffset + factor * Math.pow(rnd.nextDouble() - xOffset, power));
104-
var count = nameSource.read(buf, 0, nameLen);
105-
if (count == -1) {
106-
throw new Exception("Name source exhausted");
107-
}
108-
var nameBuf = new StringBuilder(nameLen);
109-
nameBuf.append(buf, 0, nameLen);
110-
if (Character.isWhitespace(nameBuf.charAt(0))) {
111-
nameBuf.setCharAt(0, readNonSpace(nameSource));
112-
}
113-
if (Character.isWhitespace(nameBuf.charAt(nameBuf.length() - 1))) {
114-
nameBuf.setCharAt(nameBuf.length() - 1, readNonSpace(nameSource));
115-
}
116-
var name = nameBuf.toString();
117-
while (names.contains(name)) {
118-
nameBuf.setCharAt(rnd.nextInt(nameBuf.length()), readNonSpace(nameSource));
119-
name = nameBuf.toString();
120-
}
121-
int actualLen;
122-
while (true) {
123-
actualLen = name.getBytes(StandardCharsets.UTF_8).length;
124-
if (actualLen <= 100) {
125-
break;
126-
}
127-
nameBuf.deleteCharAt(nameBuf.length() - 1);
128-
if (Character.isWhitespace(nameBuf.charAt(nameBuf.length() - 1))) {
129-
nameBuf.setCharAt(nameBuf.length() - 1, readNonSpace(nameSource));
130-
}
131-
name = nameBuf.toString();
132-
}
133-
if (name.indexOf(';') != -1) {
134-
throw new Exception("Station name contains a semicolon!");
135-
}
136-
names.add(name);
137-
minLen = Integer.min(minLen, actualLen);
138-
maxLen = Integer.max(maxLen, actualLen);
139-
var lat = Float.parseFloat(row.substring(row.indexOf(';') + 1));
140-
// Guesstimate mean temperature using cosine of latitude
141-
var avgTemp = (float) (30 * Math.cos(Math.toRadians(lat))) - 10;
142-
weatherStations.add(new WeatherStation(name, avgTemp));
143-
}
144-
}
145-
System.out.format("Generated %,d station names with length from %,d to %,d%n", KEYSET_SIZE, minLen, maxLen);
146-
return weatherStations;
147-
}
148-
149-
private static void skipComments(BufferedReader rows) throws IOException {
150-
while (rows.readLine().startsWith("#")) {
151-
}
152-
}
153-
154-
private static char readNonSpace(StringReader nameSource) throws IOException {
155-
while (true) {
156-
var n = nameSource.read();
157-
if (n == -1) {
158-
throw new IOException("Name source exhausted");
159-
}
160-
var ch = (char) n;
161-
if (ch != ' ') {
162-
return ch;
163-
}
164-
}
165-
}
16671
}

0 commit comments

Comments
 (0)